{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T12:25:10Z","timestamp":1773404710639,"version":"3.50.1"},"reference-count":61,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T00:00:00Z","timestamp":1772841600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T00:00:00Z","timestamp":1772841600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,4]]},"DOI":"10.1007\/s11263-025-02691-1","type":"journal-article","created":{"date-parts":[[2026,3,7]],"date-time":"2026-03-07T08:03:33Z","timestamp":1772870613000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Invert Your Prompt: Editing-Aware Diffusion 
Inversion"],"prefix":"10.1007","volume":"134","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3383-4349","authenticated-orcid":false,"given":"Yangyang","family":"Xu","sequence":"first","affiliation":[]},{"given":"Wenqi","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Yong","family":"Du","sequence":"additional","affiliation":[]},{"given":"Haiming","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Yang","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Jiayuan","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Ping","family":"Luo","sequence":"additional","affiliation":[]},{"given":"Shengfeng","family":"He","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,3,7]]},"reference":[{"key":"2691_CR1","doi-asserted-by":"crossref","unstructured":"Abdal, R., Qin, Y., & Wonka, P. (2019). Image2stylegan: How to embed images into the stylegan latent space? In: CVPR, pp. 4432\u20134441.","DOI":"10.1109\/ICCV.2019.00453"},{"key":"2691_CR2","doi-asserted-by":"crossref","unstructured":"Abdal, R., Qin, Y., & Wonka, P. (2020). Image2stylegan++: How to edit the embedded images? In: CVPR, pp. 8296\u20138305.","DOI":"10.1109\/CVPR42600.2020.00832"},{"issue":"6","key":"2691_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3618322","volume":"42","author":"Y Alaluf","year":"2023","unstructured":"Alaluf, Y., Richardson, E., Metzer, G., & Cohen-Or, D. (2023). A neural space-time representation for text-to-image personalization. ACM TOG, 42(6), 1\u201310.","journal-title":"ACM TOG"},{"key":"2691_CR4","unstructured":"Baranchuk, D., Rubachev, I., Voynov, A., Khrulkov, V., & Babenko, A. (2022). Label-efficient semantic segmentation with diffusion models. In: ICLR ."},{"key":"2691_CR5","doi-asserted-by":"crossref","unstructured":"Cao, M., Wang, X., Qi, Z., Shan, Y., Qie, X., & Zheng, Y. (2023). 
Masactrl: Tuning-free mutual self-attention control for consistent image synthesis and editing. In: ICCV, pp. 22560\u201322570.","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"2691_CR6","doi-asserted-by":"crossref","unstructured":"Caron, M., Touvron, H., Misra, I., J\u00e9gou, H., Mairal, J., Bojanowski, P., & Joulin, A. (2021). Emerging properties in self-supervised vision transformers. In: CVPR, pp. 9650\u20139660.","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"2691_CR7","doi-asserted-by":"crossref","unstructured":"Chai, W., Guo, X., Wang, G., & Lu, Y. (2023). Stablevideo: Text-driven consistency-aware diffusion video editing. In: ICCV, pp. 23040\u201323050.","DOI":"10.1109\/ICCV51070.2023.02106"},{"key":"2691_CR8","doi-asserted-by":"crossref","unstructured":"Chen, S., Sun, P., Song, Y., & Luo, P. (2023). Diffusiondet: Diffusion model for object detection. In: ICCV, pp. 19830\u201319843 .","DOI":"10.1109\/ICCV51070.2023.01816"},{"key":"2691_CR9","unstructured":"Cho, H., Lee, J., Kim, S.B., Oh, T.H., & Jeong, Y. (2024). Noise map guidance: Inversion with spatial context for real image editing. In: ICLR."},{"issue":"7","key":"2691_CR10","first-page":"1967","volume":"30","author":"A Creswell","year":"2018","unstructured":"Creswell, A., & Bharath, A. A. (2018). Inverting the generator of a generative adversarial network. IEEE TNNLS, 30(7), 1967\u20131974.","journal-title":"IEEE TNNLS"},{"key":"2691_CR11","unstructured":"Dhariwal, P., & Nichol, A. (2021). Diffusion models beat gans on image synthesis. In: NeurIPS, pp. 8780\u20138794 ."},{"key":"2691_CR12","doi-asserted-by":"crossref","unstructured":"Dong, W., Xue, S., Duan, X., & Han, S. (2023). Prompt tuning inversion for text-driven image editing using diffusion models. In: ICCV, pp. 7430\u20137440 .","DOI":"10.1109\/ICCV51070.2023.00683"},{"key":"2691_CR13","doi-asserted-by":"crossref","unstructured":"Garibi, D., Patashnik, O., Voynov, A., Averbuch-Elor, H., & Cohen-Or, D. (2024). 
Renoise: Real image inversion through iterative noising. In: ECCV .","DOI":"10.1007\/978-3-031-72630-9_23"},{"key":"2691_CR14","unstructured":"Geyer, M., Bar-Tal, O., Bagon, S., & Dekel, T. (2024). Tokenflow: Consistent diffusion features for consistent video editing. In: ICLR"},{"key":"2691_CR15","doi-asserted-by":"crossref","unstructured":"Han, L., Wen, S., Chen, Q., Zhang, Z., Song, K., Ren, M., Gao, R., Stathopoulos, A., He, X., & Chen, Y., et\u00a0al. (2024). Proxedit: Improving tuning-free real image editing with proximal guidance. In: WACV, pp. 4291\u20134301","DOI":"10.1109\/WACV57701.2024.00424"},{"key":"2691_CR16","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., & Cohen-Or, D. (2023). Prompt-to-prompt image editing with cross attention control. In: ICLR."},{"key":"2691_CR17","unstructured":"Ho, J., & Salimans, T. (2022). Classifier-free diffusion guidance. In: NeurIPS Workshop."},{"key":"2691_CR18","doi-asserted-by":"crossref","unstructured":"Huberman-Spiegelglas, I., Kulikov, V., & Michaeli, T. (2024). An edit friendly ddpm noise space: Inversion and manipulations. In: CVPR.","DOI":"10.1109\/CVPR52733.2024.01185"},{"key":"2691_CR19","doi-asserted-by":"crossref","unstructured":"Ji, Y., Chen, Z., Xie, E., Hong, L., Liu, X., Liu, Z., Lu, T., Li, Z., & Luo, P. (2023). Ddp: Diffusion model for dense visual prediction. In: ICCV, pp. 21741\u201321752","DOI":"10.1109\/ICCV51070.2023.01987"},{"key":"2691_CR20","unstructured":"Ju, X. (2023). Pnpinversion. https:\/\/github.com\/cure-lab\/PnPInversion."},{"key":"2691_CR21","unstructured":"Ju, X., Zeng, A., Bian, Y., Liu, S., & Xu, Q. (2024). Pnp inversion: Boosting diffusion-based editing with 3 lines of code. In: ICLR."},{"key":"2691_CR22","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., Movsisyan, A., Tadevosyan, V., Henschel, R., Wang, Z., Navasardyan, S., & Shi, H. (2023). Text2video-zero: Text-to-image diffusion models are zero-shot video generators. In: ICCV, pp. 
15954\u201315964","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"2691_CR23","unstructured":"Kingma, D.P., & Ba, J. (2015). Adam: A method for stochastic optimization. In: ICLR ."},{"key":"2691_CR24","doi-asserted-by":"crossref","unstructured":"Li, R., Li, R., Guo, S., & Zhang, L. (2024). Source prompt disentangled inversion for boosting image editability with diffusion models. In: ECCV.","DOI":"10.1007\/978-3-031-73347-5_23"},{"key":"2691_CR25","unstructured":"Li, S., van\u00a0de Weijer, J., Hu, T., Khan, F.S., Hou, Q., Wang, Y., & Yang, J. (2024). Stylediffusion: Prompt-embedding inversion for text-based editing. CVMJ."},{"key":"2691_CR26","unstructured":"Liew, J.H., Yan, H., Zhou, D., & Feng, J. (2022). Magicmix: Semantic mixing with diffusion models. arXiv preprint arXiv:2210.16056."},{"key":"2691_CR27","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, Y., Li, W., Lin, Z., & Jia, J. (2024). Video-p2p: Video editing with cross-attention control. In: CVPR.","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"2691_CR28","unstructured":"Luo, S., Tan, Y., Huang, L., Li, J., & Zhao, H. (2023). Latent consistency models: Synthesizing high-resolution images with few-step inference. arXiv preprint arXiv:2310.04378 ."},{"key":"2691_CR29","unstructured":"Luo, S., Tan, Y., Patil, S., Gu, D., von Platen, P., Passos, A., Huang, L., Li, J., & Zhao, H. (2023). Lcm-lora: A universal stable-diffusion acceleration module. arXiv preprint arXiv:2311.05556 ."},{"key":"2691_CR30","unstructured":"Meiri, B., Samuel, D., Darshan, N., Chechik, G., Avidan, S., & Ben-Ari, R. (2023). Fixed-point inversion for text-to-image diffusion models. arXiv preprint arXiv:2312.12540."},{"key":"2691_CR31","doi-asserted-by":"crossref","unstructured":"Miyake, D., Iohara, A., Saito, Y., & Tanaka, T. (2025). Negative-prompt inversion: Fast image inversion for editing with text-guided diffusion models. In: WACV, pp. 
2063\u20132072 .","DOI":"10.1109\/WACV61041.2025.00207"},{"key":"2691_CR32","doi-asserted-by":"crossref","unstructured":"Mokady, R., Hertz, A., Aberman, K., Pritch, Y., & Cohen-Or, D. (2023). Null-text inversion for editing real images using guided diffusion models. In: CVPR, pp. 6038\u20136047.","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"2691_CR33","doi-asserted-by":"crossref","unstructured":"Pan, Z., Gherardi, R., Xie, X., & Huang, S. (2023). Effective real image editing with accelerated iterative diffusion inversion. In: ICCV, pp. 15912\u201315921.","DOI":"10.1109\/ICCV51070.2023.01458"},{"key":"2691_CR34","doi-asserted-by":"crossref","unstructured":"Parmar, G., Kumar\u00a0Singh, K., Zhang, R., Li, Y., Lu, J., & Zhu, J.Y. (2023). Zero-shot image-to-image translation. In: SIGGRAPH, pp. 1\u201311 .","DOI":"10.1145\/3588432.3591513"},{"key":"2691_CR35","doi-asserted-by":"crossref","unstructured":"Patashnik, O., Garibi, D., Azuri, I., Averbuch-Elor, H., & Cohen-Or, D. (2023). Localizing object-level shape variations with text-to-image diffusion models. In: ICCV, pp. 23051\u201323061 .","DOI":"10.1109\/ICCV51070.2023.02107"},{"key":"2691_CR36","doi-asserted-by":"crossref","unstructured":"Qi, C., Cun, X., Zhang, Y., Lei, C., Wang, X., Shan, Y., & Chen, Q. (2023). Fatezero: Fusing attentions for zero-shot text-based video editing. In: ICCV, pp. 15932\u201315942.","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"2691_CR37","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., & Chen, M. (2022). Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 1(2), 3"},{"key":"2691_CR38","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 
10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2691_CR39","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., & Aberman, K. (2023). Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. In: CVPR, pp. 22500\u201322510.","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"2691_CR40","doi-asserted-by":"crossref","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E.L., Ghasemipour, K., Gontijo\u00a0Lopes, R., Karagol\u00a0Ayan, B., & Salimans, T., et\u00a0al. (2022). Photorealistic text-to-image diffusion models with deep language understanding. In: NeurIPS, vol.\u00a035, pp. 36479\u201336494 .","DOI":"10.52202\/068431-2643"},{"key":"2691_CR41","doi-asserted-by":"crossref","unstructured":"Sauer, A., Lorenz, D., Blattmann, A., & Rombach, R. (2025). Adversarial diffusion distillation. In: ECCV, pp. 87\u2013103 .","DOI":"10.1007\/978-3-031-73016-0_6"},{"key":"2691_CR42","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., & Ganguli, S. (2015). Deep unsupervised learning using nonequilibrium thermodynamics. In: ICML, pp. 2256\u20132265 ."},{"key":"2691_CR43","unstructured":"Song, J., Meng, C., & Ermon, S. (2021). Denoising diffusion implicit models. In: ICLR."},{"key":"2691_CR44","unstructured":"Song, Y., Dhariwal, P., Chen, M., & Sutskever, I. (2023). Consistency models. In: ICML."},{"key":"2691_CR45","doi-asserted-by":"crossref","unstructured":"Tumanyan, N., Geyer, M., Bagon, S., & Dekel, T. (2023). Plug-and-play diffusion features for text-driven image-to-image translation. In: CVPR, pp. 1921\u20131930","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"2691_CR46","unstructured":"Voynov, A., Chu, Q., Cohen-Or, D., & Aberman, K. (2023). $$ p+ $$: Extended textual conditioning in text-to-image generation. arXiv preprint arXiv:2303.09522."},{"key":"2691_CR47","doi-asserted-by":"crossref","unstructured":"Wallace, B., Gokul, A., & Naik, N. 
(2023). Edict: Exact diffusion inversion via coupled transformations. In: CVPR, pp. 22532\u201322541.","DOI":"10.1109\/CVPR52729.2023.02158"},{"issue":"4","key":"2691_CR48","first-page":"600","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). Image quality assessment: from error visibility to structural similarity. IEEE TIP, 13(4), 600\u2013612.","journal-title":"IEEE TIP"},{"key":"2691_CR49","unstructured":"Wu, C., Huang, L., Zhang, Q., Li, B., Ji, L., Yang, F., Sapiro, G., & Duan, N. (2021). Godiva: Generating open-domain videos from natural descriptions. arXiv preprint arXiv:2104.14806."},{"key":"2691_CR50","doi-asserted-by":"crossref","unstructured":"Wu, C.H., & De\u00a0la Torre, F. (2023). A latent space of stochastic diffusion models for zero-shot image editing and guidance. In: ICCV, pp. 7378\u20137387 .","DOI":"10.1109\/ICCV51070.2023.00678"},{"key":"2691_CR51","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., Ge, Y., Wang, X., Lei, S.W., Gu, Y., Shi, Y., Hsu, W., Shan, Y., Qie, X., & Shou, M.Z. (2023). Tune-a-video: One-shot tuning of image diffusion models for text-to-video generation. In: ICCV, pp. 7623\u20137633","DOI":"10.1109\/ICCV51070.2023.00701"},{"issue":"3","key":"2691_CR52","first-page":"3121","volume":"45","author":"W Xia","year":"2023","unstructured":"Xia, W., Zhang, Y., Yang, Y., Xue, J. H., Zhou, B., & Yang, M. H. (2023). Gan inversion: A survey. IEEE TPAMI, 45(3), 3121\u20133138.","journal-title":"IEEE TPAMI"},{"key":"2691_CR53","doi-asserted-by":"crossref","unstructured":"Xu, C., Xu, Y., Zhang, H., Xu, X., & He, S. (2024). Dreamanime: Learning style-identity textual disentanglement for anime and beyond. IEEE TVCG.","DOI":"10.1109\/TVCG.2024.3397712"},{"key":"2691_CR54","doi-asserted-by":"crossref","unstructured":"Xu, Y., Du, Y., Xiao, W., Xu, X., & He, S. (2021). From continuity to editability: Inverting gans with consecutive images. 
In: ICCV, pp. 13910\u201313918 .","DOI":"10.1109\/ICCV48922.2021.01365"},{"key":"2691_CR55","doi-asserted-by":"crossref","unstructured":"Xu, Y., He, S., Wong, K.Y.K., & Luo, P. (2023). Rigid: Recurrent gan inversion and editing of real face videos. In: ICCV, pp. 13691\u201313701 .","DOI":"10.1109\/ICCV51070.2023.01259"},{"key":"2691_CR56","unstructured":"Xue, Z., Song, G., Guo, Q., Liu, B., Zong, Z., Liu, Y., & Luo, P. (2024). Raphael: Text-to-image generation via large mixture of diffusion paths. In: NeurIPS, vol.\u00a036 ."},{"key":"2691_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, G., Lewis, J.P., & Kleijn, W.B. (2024). Exact diffusion inversion via bidirectional integration approximation. In: ECCV, pp. 19\u201336 .","DOI":"10.1007\/978-3-031-72998-0_2"},{"key":"2691_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., & Wang, O. (2018). The unreasonable effectiveness of deep features as a perceptual metric. In: CVPR .","DOI":"10.1109\/CVPR.2018.00068"},{"key":"2691_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Dong, W., Tang, F., Huang, N., Huang, H., Ma, C., Lee, T. Y., Deussen, O., & Xu, C. (2023). Prospect: Prompt spectrum for attribute-aware personalization of diffusion models. ACM TOG, 42(6), 1\u201314.","DOI":"10.1145\/3618342"},{"key":"2691_CR60","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., & Tian, Q. (2024). Controlvideo: Training-free controllable text-to-video generation. In: ICLR ."},{"key":"2691_CR61","doi-asserted-by":"crossref","unstructured":"Zhao, W., Rao, Y., Liu, Z., Liu, B., Zhou, J., & Lu, J. (2023). Unleashing text-to-image diffusion models for visual perception. In: ICCV, pp. 
5729\u20135739 .","DOI":"10.1109\/ICCV51070.2023.00527"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02691-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02691-1","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02691-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,13]],"date-time":"2026-03-13T09:38:37Z","timestamp":1773394717000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02691-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,7]]},"references-count":61,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2026,4]]}},"alternative-id":["2691"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02691-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,3,7]]},"assertion":[{"value":"7 December 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 September 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 March 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 March 2026","order":5,"name":"change_date","label":"Change Date","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Update","order":6,"name":"change_type","label":"Change 
Type","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The original version of this article is revised due to update in corresponding author\u2019s email address.","order":7,"name":"change_details","label":"Change Details","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"163"}}