{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T11:08:48Z","timestamp":1780484928447,"version":"3.54.1"},"reference-count":80,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T00:00:00Z","timestamp":1776729600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T00:00:00Z","timestamp":1776729600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100012166","name":"National Key R&D Program of China","doi-asserted-by":"crossref","award":["No.2023YFB4502804"],"award-info":[{"award-number":["No.2023YFB4502804"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"crossref"}]},{"DOI":"10.13039\/501100014219","name":"National Science Fund for Distinguished Young Scholars","doi-asserted-by":"publisher","award":["No.62025603"],"award-info":[{"award-number":["No.62025603"]}],"id":[{"id":"10.13039\/501100014219","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. U22B2051, No. U21B2037, No. 62072389, No. 62302411, No. 624B2118"],"award-info":[{"award-number":["No. U22B2051, No. U21B2037, No. 62072389, No. 62302411, No. 624B2118"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Natural Science Foundation of Fujian Province of China","award":["No.2021J06003"],"award-info":[{"award-number":["No.2021J06003"]}]},{"DOI":"10.13039\/501100002858","name":"China Postdoctoral Science Foundation","doi-asserted-by":"publisher","award":["No. 2023M732948"],"award-info":[{"award-number":["No. 2023M732948"]}],"id":[{"id":"10.13039\/501100002858","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1007\/s11263-026-02849-5","type":"journal-article","created":{"date-parts":[[2026,4,21]],"date-time":"2026-04-21T09:04:38Z","timestamp":1776762278000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["An Extensive Benchmark for Single-Round and Multi-Round Instruction-Based Image Editing"],"prefix":"10.1007","volume":"134","author":[{"given":"Yiwei","family":"Ma","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ke","family":"Ye","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Weihuang","family":"Lin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiayi","family":"Ji","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3912-9306","authenticated-orcid":false,"given":"Xiaoshuai","family":"Sun","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rongrong","family":"Ji","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2026,4,21]]},"reference":[{"key":"2849_CR1","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F.L., Almeida, D., Altenschmidt, J., Altman, S., & Anadkat, S., et al. (2023). Gpt-4 technical report. arXiv preprint arXiv:2303.08774"},{"key":"2849_CR2","doi-asserted-by":"crossref","unstructured":"Ancuti, C.O., Ancuti, C., Sbert, M., & Timofte, R. (2019). Dense-haze: A benchmark for image dehazing with dense-haze and haze-free images. In: 2019 IEEE International Conference on Image Processing (ICIP), pp. 1014\u20131018 . IEEE","DOI":"10.1109\/ICIP.2019.8803046"},{"key":"2849_CR3","doi-asserted-by":"crossref","unstructured":"Avrahami, O., Lischinski, D., & Fried, O. (2022). Blended diffusion for text-driven editing of natural images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18208\u201318218","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"2849_CR4","unstructured":"Bai, J., Bai, S., Yang, S., Wang, S., Tan, S., Wang, P., Lin, J., Zhou, C., & Zhou, J. (2023). Qwen-vl: A frontier large vision-language model with versatile abilities. arXiv preprint arXiv:2308.12966"},{"key":"2849_CR5","unstructured":"Bai, S., Cai, Y., Chen, R., Chen, K., Chen, X., Cheng, Z., Deng, L., Ding, W., Gao, C., & Ge, C., et al. (2025). Qwen3-vl technical report. arXiv preprint arXiv:2511.21631"},{"key":"2849_CR6","unstructured":"Basu, S., Saberi, M., Bhardwaj, S., Chegini, A.M., Massiceti, D., Sanjabi, M., Hu, S.X., & Feizi, S. (2023). Editval: Benchmarking diffusion based text-guided image editing methods. arXiv preprint arXiv:2310.02426"},{"key":"2849_CR7","unstructured":"Batifol, S., Blattmann, A., Boesel, F., Consul, S., Diagne, C., Dockhorn, T., English, J., English, Z., Esser, P., & Kulal, S., et al. (2025). Flux. 1 kontext: Flow matching for in-context image generation and editing in latent space. arXiv e-prints, 2506"},{"key":"2849_CR8","doi-asserted-by":"crossref","unstructured":"Bigham, J.P., Jayant, C., Ji, H., Little, G., Miller, A., Miller, R.C., Miller, R., Tatarowicz, A., White, B., & White, S., et al. (2010). Vizwiz: nearly real-time answers to visual questions. In: Proceedings of the 23nd Annual ACM Symposium on User Interface Software and Technology, pp. 333\u2013342","DOI":"10.1145\/1866029.1866080"},{"key":"2849_CR9","doi-asserted-by":"crossref","unstructured":"Brack, M., Friedrich, F., Kornmeier, K., Tsaban, L., Schramowski, P., Kersting, K., & Passos, A. (2024). Ledits++: Limitless image editing using text-to-image models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8861\u20138870","DOI":"10.1109\/CVPR52733.2024.00846"},{"key":"2849_CR10","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., & Efros, A.A. (2023). Instructpix2pix: Learning to follow image editing instructions. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18392\u201318402","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"2849_CR11","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. (2020). Language models are few-shot learners. Advances in neural information processing systems, 33, 1877\u20131901.","journal-title":"Advances in neural information processing systems"},{"key":"2849_CR12","doi-asserted-by":"crossref","unstructured":"Chen, W.-T., Fang, H.-Y., Hsieh, C.-L., Tsai, C.-C., Chen, I., Ding, J.-J., & Kuo, S.-Y., et al. (2021). All snow removed: Single image desnowing algorithm using hierarchical dual-tree complex wavelet representation and contradict channel loss. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4196\u20134205","DOI":"10.1109\/ICCV48922.2021.00416"},{"key":"2849_CR13","doi-asserted-by":"crossref","unstructured":"Chen, H., Gu, J., Liu, Y., Magid, S.A., Dong, C., Wang, Q., Pfister, H., & Zhu, L. (2023). Masked image training for generalizable deep image denoising. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1692\u20131703","DOI":"10.1109\/CVPR52729.2023.00169"},{"key":"2849_CR14","doi-asserted-by":"crossref","unstructured":"Chen, X., Li, H., Li, M., & Pan, J. (2023). Learning a sparse transformer network for effective image deraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5896\u20135905","DOI":"10.1109\/CVPR52729.2023.00571"},{"key":"2849_CR15","unstructured":"Chu, X., Qiao, L., Zhang, X., Xu, S., Wei, F., Yang, Y., Sun, X., Hu, Y., Lin, X., & Zhang, B., et al. (2024). Mobilevlm v2: Faster and stronger baseline for vision language model. arXiv preprint arXiv:2402.03766"},{"key":"2849_CR16","unstructured":"Dong, X., Zhang, P., Zang, Y., Cao, Y., Wang, B., Ouyang, L., Wei, X., Zhang, S., Duan, H., & Cao, M., et al. (2024). Internlm-xcomposer2: Mastering free-form text-image composition and comprehension in vision-language large model. arXiv preprint arXiv:2401.16420"},{"key":"2849_CR17","unstructured":"Fei, H., Wu, S., Ji, W., Zhang, H., Zhang, M., Lee, M.-L., & Hsu, W. (2024). Video-of-thought: Step-by-step video reasoning from perception to cognition. In: Forty-first International Conference on Machine Learning"},{"key":"2849_CR18","doi-asserted-by":"crossref","unstructured":"Fei, H., Wu, S., Zhang, M., Zhang, M., Chua, T.-S., & Yan, S. (2024). Enhancing video-language representations with structural spatio-temporal alignment. IEEE Transactions on Pattern Analysis and Machine Intelligence","DOI":"10.1109\/TPAMI.2024.3393452"},{"key":"2849_CR19","volume-title":"VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting","author":"H Fei","year":"2024","unstructured":"Fei, H., Wu, S., Zhang, H., Chua, T.-S., & Yan, S. (2024). VITRON: A Unified Pixel-level Vision LLM for Understanding, Generating, Segmenting. CoRR: Editing."},{"key":"2849_CR20","unstructured":"Fu, T.-J., Hu, W., Du, X., Wang, W.Y., Yang, Y., & Gan, Z. (2024). Guiding instruction-based image editing via multimodal large language models. International Conference on Learning Representations"},{"key":"2849_CR21","unstructured":"Gao, P., Zhang, R., Liu, C., Qiu, L., Huang, S., Lin, W., Zhao, S., Geng, S., Lin, Z., & Jin, P., et al. (2024). Sphinx-x: Scaling data and parameters for a family of multi-modal large language models. arXiv preprint arXiv:2402.05935"},{"key":"2849_CR22","doi-asserted-by":"crossref","unstructured":"Geng, Z., Yang, B., Hang, T., Li, C., Gu, S., Zhang, T., Bao, J., Zhang, Z., Hu, H., & Chen, D., et al. (2023). Instructdiffusion: A generalist modeling interface for vision tasks. arXiv preprint arXiv:2309.03895","DOI":"10.1109\/CVPR52733.2024.01208"},{"key":"2849_CR23","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014). Generative adversarial nets. Advances in neural information processing systems 27"},{"key":"2849_CR24","doi-asserted-by":"crossref","unstructured":"Guo, L., Wang, C., Yang, W., Huang, S., Wang, Y., Pfister, H., & Wen, B. (2023). Shadowdiffusion: When degradation prior meets diffusion model for shadow removal. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 14049\u201314058","DOI":"10.1109\/CVPR52729.2023.01350"},{"key":"2849_CR25","doi-asserted-by":"crossref","unstructured":"Guo, Y., Xiao, X., Chang, Y., Deng, S., & Yan, L. (2023). From sky to the ground: A large-scale benchmark and simple baseline towards real rain removal. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 12097\u201312107","DOI":"10.1109\/ICCV51070.2023.01111"},{"key":"2849_CR26","unstructured":"Hall, M., Ma\u00f1as, O., Askari-Hemmat, R., Ibrahim, M., Ross, C., Astolfi, P., Ifriqi, T.B., Havasi, M., Benchetrit, Y., & Ullrich, K., et al. (2024). Evalgim: A library for evaluating generative image models. arXiv preprint arXiv:2412.10604"},{"key":"2849_CR27","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in neural information processing systems, 33, 6840\u20136851.","journal-title":"Advances in neural information processing systems"},{"key":"2849_CR28","unstructured":"Huang, Y., Huang, J., Liu, Y., Yan, M., Lv, J., Liu, J., Xiong, W., Zhang, H., Chen, S., & Cao, L. (2024). Diffusion model-based image editing: A survey. arXiv preprint arXiv:2402.17525"},{"key":"2849_CR29","doi-asserted-by":"crossref","unstructured":"Huang, Y., Xie, L., Wang, X., Yuan, Z., Cun, X., Ge, Y., Zhou, J., Dong, C., Huang, R., & Zhang, R., et al. (2024). Smartedit: Exploring complex instruction-based image editing with multimodal large language models. Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR52733.2024.00799"},{"key":"2849_CR30","doi-asserted-by":"crossref","unstructured":"Hudson, D.A., & Manning, C.D. (2019). Gqa: A new dataset for real-world visual reasoning and compositional question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6700\u20136709","DOI":"10.1109\/CVPR.2019.00686"},{"key":"2849_CR31","unstructured":"Hui, M., Yang, S., Zhao, B., Shi, Y., Wang, H., Wang, P., Zhou, Y., & Xie, C. (2024). Hq-edit: A high-quality dataset for instruction-based image editing. arXiv preprint arXiv:2404.09990"},{"key":"2849_CR32","doi-asserted-by":"publisher","first-page":"4321","DOI":"10.1109\/TIP.2022.3183434","volume":"31","author":"J Ji","year":"2022","unstructured":"Ji, J., Ma, Y., Sun, X., Zhou, Y., Wu, Y., & Ji, R. (2022). Knowing what to learn: a metric-oriented focal mechanism for image captioning. IEEE Transactions on Image Processing, 31, 4321\u20134335.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2849_CR33","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., & Aila, T. (2019). A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4401\u20134410","DOI":"10.1109\/CVPR.2019.00453"},{"key":"2849_CR34","doi-asserted-by":"crossref","unstructured":"Kawar, B., Zada, S., Lang, O., Tov, O., Chang, H., Dekel, T., Mosseri, I., & Irani, M. (2023). Imagic: Text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6007\u20136017","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"2849_CR35","doi-asserted-by":"crossref","unstructured":"Kong, X., Liu, X., Gu, J., Qiao, Y., & Dong, C. (2022). Reflash dropout in image super-resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6002\u20136012","DOI":"10.1109\/CVPR52688.2022.00591"},{"key":"2849_CR36","doi-asserted-by":"crossref","unstructured":"Korhonen, J., & You, J. (2012). Peak signal-to-noise ratio revisited: Is simple beautiful? In: 2012 Fourth International Workshop on Quality of Multimedia Experience, pp. 37\u201338 . IEEE","DOI":"10.1109\/QoMEX.2012.6263880"},{"key":"2849_CR37","unstructured":"Kulikov, V., Yadin, S., Kleiner, M., & Michaeli, T. (2023). Sinddm: A single image denoising diffusion model. In: International Conference on Machine Learning (ICML), pp. 17920\u201317930 . PMLR"},{"key":"2849_CR38","doi-asserted-by":"crossref","unstructured":"Li, Y., Du, Y., Zhou, K., Wang, J., Zhao, W.X., & Wen, J.-R. (2023). Evaluating object hallucination in large vision-language models. arXiv preprint arXiv:2305.10355","DOI":"10.18653\/v1\/2023.emnlp-main.20"},{"key":"2849_CR39","unstructured":"Li, S., Singh, H., & Grover, A. (2023). Instructany2pix: Flexible visual editing via multimodal instruction following. arXiv preprint arXiv:2312.06738"},{"key":"2849_CR40","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, Y., He, Y., Li, Y., Wang, Y., Liu, Y., Wang, Z., Xu, J., Chen, G., & Luo, P., et al. (2023). Mvbench: A comprehensive multi-modal video understanding benchmark. arXiv preprint arXiv:2311.17005","DOI":"10.1109\/CVPR52733.2024.02095"},{"key":"2849_CR41","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, R., Wang, G., Ge, Y., Ge, Y., & Shan, Y. (2023). Seed-bench: Benchmarking multimodal llms with generative comprehension. arXiv preprint arXiv:2307.16125","DOI":"10.1109\/CVPR52733.2024.01263"},{"key":"2849_CR42","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C.L. (2014). Microsoft coco: Common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755 . Springer","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2849_CR43","doi-asserted-by":"crossref","unstructured":"Liu, H., Li, C., Li, Y., & Lee, Y.J. (2023). Improved Baselines with Visual Instruction Tuning. arXiv:2310.03744","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"2849_CR44","unstructured":"Liu, H., Li, C., Wu, Q., & Lee, Y.J. (2023). Visual Instruction Tuning. NeurIPS"},{"key":"2849_CR45","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhu, Z., & Bai, X. (2021). Wdnet: Watermark-decomposition network for visible watermark removal. In: IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV), pp. 3685\u20133693","DOI":"10.1109\/WACV48630.2021.00373"},{"key":"2849_CR46","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhu, L., Pei, S., Fu, H., Qin, J., Zhang, Q., Wan, L., & Feng, W. (2021). From synthetic to real: Image dehazing collaborating with unlabeled real data. In: Proceedings of the 29th ACM International Conference on Multimedia, pp. 50\u201358","DOI":"10.1145\/3474085.3475331"},{"key":"2849_CR47","first-page":"2507","volume":"35","author":"P Lu","year":"2022","unstructured":"Lu, P., Mishra, S., Xia, T., Qiu, L., Chang, K.-W., Zhu, S.-C., Tafjord, O., Clark, P., & Kalyan, A. (2022). Learn to explain: Multimodal reasoning via thought chains for science question answering. Advances in Neural Information Processing Systems, 35, 2507\u20132521.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2849_CR48","unstructured":"Ma, Y., Ji, J., Ye, K., Lin, W., Zheng, Y., Zhou, Q., Sun, X., & Ji, R., et al. (2024). I2ebench: A comprehensive benchmark for instruction-based image editing. In: The Thirty-eighth Annual Conference on Neural Information Processing Systems"},{"key":"2849_CR49","unstructured":"Ma, Y., Wang, Z., Sun, X., Lin, W., Zhou, Q., Ji, J., & Ji, R. (2024). Inf-llava: Dual-perspective perception for high-resolution multimodal large language model. arXiv preprint arXiv:2407.16198"},{"key":"2849_CR50","doi-asserted-by":"crossref","unstructured":"Ma, Y., Xu, G., Sun, X., Yan, M., Zhang, J., & Ji, R. (2022). X-clip: End-to-end multi-grained contrastive learning for video-text retrieval. In: Proceedings of the 30th ACM International Conference on Multimedia, pp. 638\u2013647","DOI":"10.1145\/3503161.3547910"},{"key":"2849_CR51","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2023.109420","volume":"138","author":"Y Ma","year":"2023","unstructured":"Ma, Y., Ji, J., Sun, X., Zhou, Y., & Ji, R. (2023). Towards local visual modeling for image captioning. Pattern Recognition, 138, Article 109420.","journal-title":"Pattern Recognition"},{"key":"2849_CR52","doi-asserted-by":"crossref","unstructured":"Marino, K., Rastegari, M., Farhadi, A., & Mottaghi, R. (2019). Ok-vqa: A visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/cvf Conference on Computer Vision and Pattern Recognition, pp. 3195\u20133204","DOI":"10.1109\/CVPR.2019.00331"},{"key":"2849_CR53","doi-asserted-by":"crossref","unstructured":"Martin, D., Fowlkes, C., Tal, D., & Malik, J. (2001). A database of human segmented natural images and its application to evaluating segmentation algorithms and measuring ecological statistics. In: Proc. 8th Int\u2019l Conf. Computer Vision,2, 416\u2013423.","DOI":"10.1109\/ICCV.2001.937655"},{"key":"2849_CR54","doi-asserted-by":"crossref","unstructured":"Nah, S., Kim, T.H., & Lee, K.M. (2017). Deep multi-scale convolutional neural network for dynamic scene deblurring. In: The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","DOI":"10.1109\/CVPR.2017.35"},{"key":"2849_CR55","doi-asserted-by":"crossref","unstructured":"Qu, L., Tian, J., He, S., Tang, Y., & Lau, R.W. (2017). Deshadownet: A multi-context embedding deep network for shadow removal. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4067\u20134075","DOI":"10.1109\/CVPR.2017.248"},{"key":"2849_CR56","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., & Clark, J., et al. (2021). Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML), pp. 8748\u20138763 . PMLR"},{"key":"2849_CR57","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I. (2021). Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831 . Pmlr"},{"key":"2849_CR58","unstructured":"Reid, M., Savinov, N., Teplyashin, D., Lepikhin, D., Lillicrap, T., Alayrac, J.-b., Soricut, R., Lazaridou, A., Firat, O., & Schrittwieser, J., et al. (2024). Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context. arXiv preprint arXiv:2403.05530"},{"key":"2849_CR59","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E. L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T., et al. (2022). Photorealistic text-to-image diffusion models with deep language understanding. Advances in neural information processing systems, 35, 36479\u201336494.","journal-title":"Advances in neural information processing systems"},{"key":"2849_CR60","doi-asserted-by":"crossref","unstructured":"Sanghvi, Y., Mao, Z., & Chan, S.H. (2023). Structured kernel estimation for photon-limited deconvolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 9863\u20139872","DOI":"10.1109\/CVPR52729.2023.00951"},{"key":"2849_CR61","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., et al. (2022). Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, 35, 25278\u201325294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2849_CR62","doi-asserted-by":"crossref","unstructured":"Shen, Z., Wang, W., Lu, X., Shen, J., Ling, H., Xu, T., & Shao, L. (2019). Human-aware motion deblurring. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5572\u20135581","DOI":"10.1109\/ICCV.2019.00567"},{"key":"2849_CR63","doi-asserted-by":"crossref","unstructured":"Sheynin, S., Polyak, A., Singer, U., Kirstain, Y., Zohar, A., Ashual, O., Parikh, D., & Taigman, Y. (2023). Emu edit: Precise image editing via recognition and generation tasks. arXiv preprint arXiv:2311.10089","DOI":"10.1109\/CVPR52733.2024.00847"},{"key":"2849_CR64","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., & Ganguli, S. (2015). Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning (ICML), pp. 2256\u20132265 . PMLR"},{"key":"2849_CR65","unstructured":"Song, J., Meng, C., & Ermon, S. (2020). Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502"},{"key":"2849_CR66","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., & Azhar, F., et al. (2023). Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971"},{"key":"2849_CR67","doi-asserted-by":"crossref","unstructured":"Wang, H., Chen, X., Ni, B., Liu, Y., & Liu, J. (2023). Omni aggregation networks for lightweight image super-resolution. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22378\u201322387","DOI":"10.1109\/CVPR52729.2023.02143"},{"key":"2849_CR68","doi-asserted-by":"crossref","unstructured":"Wang, S., Saharia, C., Montgomery, C., Pont-Tuset, J., Noy, S., Pellegrini, S., Onoe, Y., Laszlo, S., Fleet, D.J., & Soricut, R., et al. (2023). Imagen editor and editbench: Advancing and evaluating text-guided image inpainting. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18359\u201318369","DOI":"10.1109\/CVPR52729.2023.01761"},{"key":"2849_CR69","unstructured":"Wang, Q., Zhang, B., Birsak, M., & Wonka, P. (2023). Instructedit: Improving automatic masks for diffusion-based image editing with user instructions. arXiv preprint arXiv:2305.18047"},{"issue":"4","key":"2849_CR70","doi-asserted-by":"publisher","first-page":"600","DOI":"10.1109\/TIP.2003.819861","volume":"13","author":"Z Wang","year":"2004","unstructured":"Wang, Z., Bovik, A. C., Sheikh, H. R., & Simoncelli, E. P. (2004). Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing, 13(4), 600\u2013612.","journal-title":"IEEE transactions on image processing"},{"key":"2849_CR71","unstructured":"Wei, C., Wang, W., Yang, W., & Liu, J. (2018). Deep retinex decomposition for low-light enhancement. arXiv preprint arXiv:1808.04560"},{"key":"2849_CR72","unstructured":"Welling, M., & Teh, Y.W. (2011). Bayesian learning via stochastic gradient langevin dynamics. In: Proceedings of the 28th International Conference on Machine Learning (ICML), pp. 681\u2013688"},{"key":"2849_CR73","doi-asserted-by":"crossref","unstructured":"Wu, R.-Q., Duan, Z.-P., Guo, C.-L., Chai, Z., & Li, C. (2023). Ridcp: Revitalizing real image dehazing via high-quality codebook priors. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22282\u201322291","DOI":"10.1109\/CVPR52729.2023.02134"},{"key":"2849_CR74","unstructured":"Wu, C., Li, J., Zhou, J., Lin, J., Gao, K., Yan, K., Yin, S.-m., Bai, S., Xu, X., & Chen, Y., et al. (2025). Qwen-image technical report. arXiv preprint arXiv:2508.02324"},{"key":"2849_CR75","unstructured":"Wu, H., Zhang, Z., Zhang, E., Chen, C., Liao, L., Wang, A., Li, C., Sun, W., Yan, Q., & Zhai, G., et al. (2023). Q-bench: A benchmark for general-purpose foundation models on low-level vision. arXiv preprint arXiv:2309.14181"},{"key":"2849_CR76","unstructured":"Yu, W., Yang, Z., Li, L., Wang, J., Lin, K., Liu, Z., Wang, X., & Wang, L. (2023). Mm-vet: Evaluating large multimodal models for integrated capabilities. arXiv preprint arXiv:2308.02490"},{"key":"2849_CR77","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., & Wang, O. (2018). The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 586\u2013595","DOI":"10.1109\/CVPR.2018.00068"},{"key":"2849_CR78","unstructured":"Zhang, K., Mo, L., Chen, W., Sun, H., & Su, Y. (2024). Magicbrush: A manually annotated dataset for instruction-guided image editing. Advances in Neural Information Processing Systems 36"},{"key":"2849_CR79","doi-asserted-by":"crossref","unstructured":"Zhang, S., Yang, X., Feng, Y., Qin, C., Chen, C.-C., Yu, N., Chen, Z., Wang, H., Savarese, S., & Ermon, S., et al. (2023). Hive: Harnessing human feedback for instructional visual editing. arXiv preprint arXiv:2303.09618","DOI":"10.1109\/CVPR52733.2024.00862"},{"key":"2849_CR80","doi-asserted-by":"crossref","unstructured":"Zhu, D., Tang, X., Han, W., Lu, J., Zhao, Y., Xing, G., Wang, J., & Yin, D. (2024). Vislinginstruct: Elevating zero-shot learning in multi-modal language models with autonomous instruction optimization. arXiv preprint arXiv:2402.07398","DOI":"10.18653\/v1\/2024.naacl-long.117"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02849-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-026-02849-5","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-026-02849-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T10:30:38Z","timestamp":1780482638000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-026-02849-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,21]]},"references-count":80,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2026,5]]}},"alternative-id":["2849"],"URL":"https:\/\/doi.org\/10.1007\/s11263-026-02849-5","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,4,21]]},"assertion":[{"value":"2 May 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"5 April 2026","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 April 2026","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}},{"value":"Code are made publicly available at\n                      \n                      .","order":3,"name":"Ethics","group":{"name":"EthicsHeading","label":"Code availability"}}],"article-number":"240"}}