{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T18:45:36Z","timestamp":1762109136528,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":57,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031729850"},{"type":"electronic","value":"9783031729867"}],"license":[{"start":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T00:00:00Z","timestamp":1730505600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T00:00:00Z","timestamp":1730505600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72986-7_9","type":"book-chapter","created":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T05:07:06Z","timestamp":1730437626000},"page":"144-160","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["EraseDraw: Learning to\u00a0Insert Objects by\u00a0Erasing Them from\u00a0Images"],"prefix":"10.1007","author":[{"given":"Alper","family":"Canberk","sequence":"first","affiliation":[]},{"given":"Maksym","family":"Bondarenko","sequence":"additional","affiliation":[]},{"given":"Ege","family":"Ozguroglu","sequence":"additional","affiliation":[]},{"given":"Ruoshi","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Carl","family":"Vondrick","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,2]]},"reference":[{"key":"9_CR1","unstructured":"Amit, T., Shaharbany, T., Nachmani, E., Wolf, L.: Segdiff: image segmentation with diffusion probabilistic models. arXiv preprint arXiv:2112.00390 (2021)"},{"key":"9_CR2","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Lischinski, D., Fried, O.: Blended diffusion for text-driven editing of natural images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18208\u201318218 (2022)","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"9_CR3","unstructured":"Avrahami, O., Tamir, G., Benaim, S., Fried, O.: Emu edit: multi-task instruction-guided image editing with learned task embeddings. arXiv preprint arXiv:2304.04384 (2023)"},{"key":"9_CR4","doi-asserted-by":"publisher","first-page":"2629","DOI":"10.1007\/s11263-020-01336-9","volume":"128","author":"S Azadi","year":"2020","unstructured":"Azadi, S., Pathak, D., Ebrahimi, S., Darrell, T.: Compositional GAN: learning image-conditional binary composition. Int. J. Comput. Vision 128, 2629\u20132642 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"9_CR5","doi-asserted-by":"crossref","unstructured":"Bar-Tal, O., Ofri-Amar, D., Fridman, R., Kasten, Y., Ullman, S., Dekel, T.: Text2live: text-driven layered image and video editing. arXiv preprint arXiv:2204.02491 (2022)","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"9_CR6","unstructured":"Blattmann, A., et al.: Stable video diffusion: scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)"},{"key":"9_CR7","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: Instructpix2pix: learning to follow image editing instructions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18392\u201318402 (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: Instructpix2pix: learning to follow image editing instructions. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"9_CR9","unstructured":"Brooks, T., et al.: Video generation models as world simulators (2024). https:\/\/openai.com\/research\/video-generation-models-as-world-simulators"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Chen, S., Sun, P., Song, Y., Luo, P.: Diffusiondet: diffusion model for object detection. arXiv preprint arXiv:2211.09788 (2022)","DOI":"10.1109\/ICCV51070.2023.01816"},{"key":"9_CR11","unstructured":"Choi, W., Chao, Y.W., Pantofaru, C., Savarese, S.: Context-driven 3D scene understanding from a single image. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2688\u20132695 (2012)"},{"key":"9_CR12","unstructured":"Deitke, M., et al.: Objaverse-XL: a universe of 10M+ 3D objects. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"9_CR13","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: NeurIPS (2021)"},{"issue":"6","key":"9_CR14","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2366145.2366154","volume":"31","author":"M Fisher","year":"2012","unstructured":"Fisher, M., Ritchie, D., Savva, M., Funkhouser, T., Hanrahan, P.: Example-based synthesis of 3D object arrangements. ACM Trans. Graph. (TOG) 31(6), 1\u201311 (2012)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"9_CR15","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)"},{"key":"9_CR16","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: NeurIPS (2014)"},{"key":"9_CR17","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)"},{"key":"9_CR18","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"9_CR19","doi-asserted-by":"crossref","unstructured":"Huang, X., Belongie, S.: Arbitrary style transfer in real-time with adaptive instance normalization. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1501\u20131510 (2017)","DOI":"10.1109\/ICCV.2017.167"},{"key":"9_CR20","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., Manning, C.D.: GQA: a new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709 (2019)","DOI":"10.1109\/CVPR.2019.00686"},{"key":"9_CR21","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.Y., Zhou, T., Efros, A.A.: Image-to-image translation with conditional adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1125\u20131134 (2017)","DOI":"10.1109\/CVPR.2017.632"},{"key":"9_CR22","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aila, T.: A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4401\u20134410 (2019)","DOI":"10.1109\/CVPR.2019.00453"},{"key":"9_CR23","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., Aila, T.: Analyzing and improving the image quality of stylegan. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8110\u20138119 (2020)","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"9_CR24","unstructured":"Kawar, B., Lang, O., Tov, O., Irani, M.: Imagen editor: text-based selection and editing of images. arXiv preprint arXiv:2211.15481 (2022)"},{"key":"9_CR25","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18215\u201318224 (2022)"},{"key":"9_CR26","unstructured":"Kingma, D.P., Welling, M.: Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)"},{"key":"9_CR27","unstructured":"Lee, J.Y., Tseng, Z., Abbeel, P.: Relaxed placement: learning to synthesize compositional scene layouts with object relations. In: Computer Vision and Pattern Recognition (CVPR) (2022)"},{"key":"9_CR28","doi-asserted-by":"crossref","unstructured":"Li, A.C., Prabhudesai, M., Duggal, S., Brown, E., Pathak, D.: Your diffusion model is secretly a zero-shot classifier. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 2206\u20132217 (2023)","DOI":"10.1109\/ICCV51070.2023.00210"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Lin, D., Fidler, S., Urtasun, R.: Holistic scene understanding for 3D object detection with RGBD cameras. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1417\u20131424 (2013)","DOI":"10.1109\/ICCV.2013.179"},{"key":"9_CR30","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: zero-shot one image to 3D object. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9298\u20139309 (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"9_CR31","unstructured":"Liu, Y., et al.: Syncdreamer: learning to generate multiview-consistent images from a single-view image. arXiv preprint arXiv:2309.03453 (2023)"},{"key":"9_CR32","unstructured":"Meng, C., Song, Y., Song, J., Wu, J., Zhu, J.Y., Ermon, S.: Sdedit: image synthesis and editing with stochastic differential equations. In: International Conference on Learning Representations (2021)"},{"key":"9_CR33","unstructured":"Michel, O., Bhattad, A., VanderBilt, E., Krishna, R., Kembhavi, A., Gupta, T.: Object 3DIT: language-guided 3D-aware image editing. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"9_CR34","doi-asserted-by":"crossref","unstructured":"Mokady, R., Hertz, A., Aberman, K., Pritch, Y., Cohen-Or, D.: Null-text inversion for editing real images using guided diffusion models. arXiv preprint arXiv:2211.09794 (2022)","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"9_CR35","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"9_CR36","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning, pp. 8162\u20138171. PMLR (2021)"},{"key":"9_CR37","unstructured":"OSU NLP Group: Hive-magicbrush model checkpoint (2023). https:\/\/huggingface.co\/osunlp\/HIVE-MagicBrush\/resolve\/main\/MagicBrush-epoch-000130.ckpt. Accessed 21 Mar 2024"},{"key":"9_CR38","doi-asserted-by":"crossref","unstructured":"Ozguroglu, E., et al.: pix2gestalt: amodal segmentation by synthesizing wholes. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00377"},{"key":"9_CR39","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"9_CR40","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"9_CR41","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: fine tuning text-to-image diffusion models for subject-driven generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"9_CR42","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487 (2022)"},{"key":"9_CR43","unstructured":"Sang, Y., et al.: Image sculpting: interactive image editing using 3D geometric operations. arXiv preprint arXiv:2303.13786 (2023)"},{"key":"9_CR44","doi-asserted-by":"crossref","unstructured":"Sargent, K., et al.: ZeroNVS: zero-shot 360-degree view synthesis from a single real image. arXiv preprint arXiv:2310.17994 (2023)","DOI":"10.1109\/CVPR52733.2024.00900"},{"key":"9_CR45","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. In: NeurIPS (2022)"},{"key":"9_CR46","doi-asserted-by":"crossref","unstructured":"Shen, Y., Gu, J., Tang, X., Zhou, B.: Interpreting the latent space of GANs for semantic face editing. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9243\u20139252 (2020)","DOI":"10.1109\/CVPR42600.2020.00926"},{"key":"9_CR47","doi-asserted-by":"crossref","unstructured":"Sheynin, S., et al.: Emu edit: precise image editing via recognition and generation tasks. arXiv preprint arXiv:2311.10089 (2023)","DOI":"10.1109\/CVPR52733.2024.00847"},{"key":"9_CR48","unstructured":"Su, D., Yu, C., Frank, B., Guibas, L., Welling, M., Tu, Z.: Edict: exact diffusion inversion via coupled transformations. arXiv preprint arXiv:2211.12446 (2022)"},{"key":"9_CR49","unstructured":"Su, Z., et al.: Magicbrush: text-to-image editing with a human in the loop. arXiv preprint arXiv:2302.04754 (2023)"},{"key":"9_CR50","unstructured":"Voynov, A., Babenko, A.: Unsupervised discovery of interpretable directions in the GAN latent space. In: International Conference on Machine Learning, pp. 9786\u20139796. PMLR (2020)"},{"key":"9_CR51","unstructured":"Wang, T., et al.: Pretraining is all you need for image-to-image translation. In: European Conference on Computer Vision, pp. 30\u201348. Springer, Cham (2022)"},{"key":"9_CR52","unstructured":"Wu, R., Liu, R., Vondrick, C., Zheng, C.: SIN3DM: learning a diffusion model from a single 3D textured shape. arXiv preprint arXiv:2305.15399 (2023)"},{"key":"9_CR53","doi-asserted-by":"crossref","unstructured":"Xu, J., Liu, S., Vahdat, A., Byeon, W., Wang, X., De\u00a0Mello, S.: Open-Vocabulary Panoptic Segmentation with Text-to-Image Diffusion Models. arXiv preprint arXiv:2303.04803 (2023)","DOI":"10.1109\/CVPR52729.2023.00289"},{"key":"9_CR54","doi-asserted-by":"crossref","unstructured":"Zhan, G., Zheng, C., Xie, W., Zisserman, A.: Amodal ground truth and completion in the wild. CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.02645"},{"key":"9_CR55","doi-asserted-by":"crossref","unstructured":"Zhang, C., Wang, L., Yang, R.: Scene design by integrating geometry and physics for realistic image synthesis. In: Computer Graphics Forum, vol.\u00a033, pp. 61\u201370. Wiley Online Library (2014)","DOI":"10.1111\/cgf.12402"},{"key":"9_CR56","unstructured":"Zhao, W.H., et al.: Image-based contextual advertisement recommendation. In: Proceedings of the 34th International ACM SIGIR Conference on Research and Development in Information Retrieval, pp. 821\u2013830 (2011)"},{"key":"9_CR57","doi-asserted-by":"crossref","unstructured":"Zhu, J.Y., Park, T., Isola, P., Efros, A.A.: Unpaired image-to-image translation using cycle-consistent adversarial networks. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 2223\u20132232 (2017)","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72986-7_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T05:07:52Z","timestamp":1730437672000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72986-7_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,2]]},"ISBN":["9783031729850","9783031729867"],"references-count":57,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72986-7_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,2]]},"assertion":[{"value":"2 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}