{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T10:41:23Z","timestamp":1779360083401,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172393"],"award-info":[{"award-number":["62172393"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Major Public Welfare Project of Henan Province","award":["201300311200"],"award-info":[{"award-number":["201300311200"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755811","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T06:47:18Z","timestamp":1761374838000},"page":"10622-10631","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["MIGE: Mutually Enhanced Multimodal Instruction-Based Image Generation and Editing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4867-3637","authenticated-orcid":false,"given":"Xueyun","family":"Tian","sequence":"first","affiliation":[{"name":"State Key Laboratory of AI Safety, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4464-2446","authenticated-orcid":false,"given":"Wei","family":"Li","sequence":"additional","affiliation":[{"name":"State Key Laboratory of AI Safety, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-8319-2681","authenticated-orcid":false,"given":"Bingbing","family":"Xu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of AI Safety, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8856-668X","authenticated-orcid":false,"given":"Yige","family":"Yuan","sequence":"additional","affiliation":[{"name":"State Key Laboratory of AI Safety, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China and University of Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6799-1756","authenticated-orcid":false,"given":"Yuanzhuo","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of AI Safety, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1081-8119","authenticated-orcid":false,"given":"Huawei","family":"Shen","sequence":"additional","affiliation":[{"name":"State Key Laboratory of AI Safety, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_1_2_1","unstructured":"Junsong Chen Jincheng Yu Chongjian Ge Lewei Yao Enze Xie Yue Wu Zhongdao Wang James Kwok Ping Luo Huchuan Lu et al. 2023. Pixart-?: Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426 (2023)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2669"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00630"},{"key":"e_1_3_2_1_5_1","volume-title":"Qing Liu, Yijun Li, Jianming Zhang, Nanxuan Zhao, Yilin Wang, et al.","author":"Chen Xi","year":"2024","unstructured":"Xi Chen, Zhifei Zhang, He Zhang, Yuqian Zhou, Soo Ye Kim, Qing Liu, Yijun Li, Jianming Zhang, Nanxuan Zhao, Yilin Wang, et al., 2024c. UniReal: Universal Image Generation and Editing via Learning Real-world Dynamics. arXiv preprint arXiv:2412.07774 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Improving Diffusion Models for Authentic Virtual Try-on in the Wild. arXiv preprint arXiv:2403.05139","author":"Choi Yisol","year":"2024","unstructured":"Yisol Choi, Sangkyung Kwak, Kyungmin Lee, Hyungwon Choi, and Jinwoo Shin. 2024. Improving Diffusion Models for Authentic Virtual Try-on in the Wild. arXiv preprint arXiv:2403.05139 (2024)."},{"key":"e_1_3_2_1_7_1","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al., 2024. Scaling instruction-finetuned language models. Journal of Machine Learning Research, Vol. 25, 70 (2024), 1-53.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_8_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, et al., 2024. Scaling rectified flow transformers for high-resolution image synthesis. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01855"},{"key":"e_1_3_2_1_10_1","volume-title":"Yinfei Yang, and Zhe Gan.","author":"Fu Tsu-Jui","year":"2023","unstructured":"Tsu-Jui Fu, Wenze Hu, Xianzhi Du, William Yang Wang, Yinfei Yang, and Zhe Gan. 2023. Guiding instruction-based image editing via multimodal large language models. arXiv preprint arXiv:2309.17102 (2023)."},{"key":"e_1_3_2_1_11_1","unstructured":"Yuying Ge Sijie Zhao Chen Li Yixiao Ge and Ying Shan. 2024. SEED-Data-Edit Technical Report: A Hybrid Dataset for Instructional Image Editing. arXiv preprint arXiv:2405.04007 (2024)."},{"key":"e_1_3_2_1_12_1","volume-title":"Proceedings of the fourteenth international conference on artificial intelligence and statistics. JMLR Workshop and Conference Proceedings, 315-323","author":"Glorot Xavier","year":"2011","unstructured":"Xavier Glorot, Antoine Bordes, and Yoshua Bengio. 2011. Deep sparse rectifier neural networks. In Proceedings of the fourteenth international conference on artificial intelligence and statistics. JMLR Workshop and Conference Proceedings, 315-323."},{"key":"e_1_3_2_1_13_1","volume-title":"ACE: All-round Creator and Editor","author":"Han Zhen","year":"2024","unstructured":"Zhen Han, Zeyinzi Jiang, Yulin Pan, Jingfeng Zhang, Chaojie Mao, Chenwei Xie, Yu Liu, and Jingren Zhou. 2024. ACE: All-round Creator and Editor Following Instructions via Diffusion Transformer. arXiv preprint arXiv:2410.00086 (2024)."},{"key":"e_1_3_2_1_14_1","volume-title":"Affordance-Aware Object Insertion via Mask-Aware Dual Diffusion. arXiv preprint arXiv:2412.14462","author":"He Jixuan","year":"2024","unstructured":"Jixuan He, Wanhua Li, Ye Liu, Junsik Kim, Donglai Wei, and Hanspeter Pfister. 2024a. Affordance-Aware Object Insertion via Mask-Aware Dual Diffusion. arXiv preprint arXiv:2412.14462 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Freeedit: Mask-free reference-based image editing with multi-modal instruction. arXiv preprint arXiv:2409.18071","author":"He Runze","year":"2024","unstructured":"Runze He, Kai Ma, Linjiang Huang, Shaofei Huang, Jialin Gao, Xiaoming Wei, Jiao Dai, Jizhong Han, and Si Liu. 2024b. Freeedit: Mask-free reference-based image editing with multi-modal instruction. arXiv preprint arXiv:2409.18071 (2024)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00455"},{"key":"e_1_3_2_1_17_1","unstructured":"Lianghua Huang Wei Wang Zhi-Fan Wu Huanzhang Dou Yupeng Shi Yutong Feng Chen Liang Yu Liu and Jingren Zhou. 2024a. Group diffusion transformers are unsupervised multitask learners. (2024)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00799"},{"key":"e_1_3_2_1_19_1","volume-title":"WeGen: A Unified Model for Interactive Multimodal Generation as We Chat. arXiv preprint arXiv:2503.01115","author":"Huang Zhipeng","year":"2025","unstructured":"Zhipeng Huang, Shaobin Zhuang, Canmiao Fu, Binxin Yang, Ying Zhang, Chong Sun, Zhizheng Zhang, Yali Wang, Chen Li, and Zheng-Jun Zha. 2025. WeGen: A Unified Model for Interactive Multimodal Generation as We Chat. arXiv preprint arXiv:2503.01115 (2025)."},{"key":"e_1_3_2_1_20_1","unstructured":"Aaron Hurst Adam Lerer Adam P Goucher Adam Perelman Aditya Ramesh Aidan Clark AJ Ostrow Akila Welihinda Alan Hayes Alec Radford et al. 2024. Gpt-4o system card. arXiv preprint arXiv:2410.21276 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Min Jin Chong, and Xin Lu","author":"Kang Hao","year":"2025","unstructured":"Hao Kang, Stathi Fotiadis, Liming Jiang, Qing Yan, Yumin Jia, Zichuan Liu, Min Jin Chong, and Xin Lu. 2025. Flux Already Knows-Activating Subject-Driven Image Generation without Training. arXiv preprint arXiv:2504.11478 (2025)."},{"key":"e_1_3_2_1_22_1","volume-title":"Learning to Customize Text-to-Image Diffusion In Diverse Context. arXiv preprint arXiv:2410.10058","author":"Kim Taewook","year":"2024","unstructured":"Taewook Kim, Wei Chen, and Qiang Qiu. 2024. Learning to Customize Text-to-Image Diffusion In Diverse Context. arXiv preprint arXiv:2410.10058 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_2_1_25_1","unstructured":"Black Forest Labs. 2023. FLUX. https:\/\/github.com\/black-forest-labs\/flux."},{"key":"e_1_3_2_1_26_1","volume-title":"One Diffusion to Generate Them All. arXiv preprint arXiv:2411.16318","author":"Le Duong H","year":"2024","unstructured":"Duong H Le, Tuan Pham, Sangho Lee, Christopher Clark, Aniruddha Kembhavi, Stephan Mandt, Ranjay Krishna, and Jiasen Lu. 2024. One Diffusion to Generate Them All. arXiv preprint arXiv:2411.16318 (2024)."},{"key":"e_1_3_2_1_27_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Li Dongxu","year":"2024","unstructured":"Dongxu Li, Junnan Li, and Steven Hoi. 2024a. Blip-diffusion: Pre-trained subject representation for controllable text-to-image generation and editing. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_28_1","volume-title":"International conference on machine learning. PMLR","author":"Li Junnan","year":"2023","unstructured":"Junnan Li, Dongxu Li, Silvio Savarese, and Steven Hoi. 2023b. Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. In International conference on machine learning. PMLR, 19730-19742."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73116-7_14"},{"key":"e_1_3_2_1_30_1","volume-title":"Dreamedit: Subject-driven image editing. arXiv preprint arXiv:2306.12624","author":"Li Tianle","year":"2023","unstructured":"Tianle Li, Max Ku, Cong Wei, and Wenhu Chen. 2023a. Dreamedit: Subject-driven image editing. arXiv preprint arXiv:2306.12624 (2023)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.335"},{"key":"e_1_3_2_1_32_1","volume-title":"Tokenpacker: Efficient visual projector for multimodal llm. arXiv preprint arXiv:2407.02392","author":"Li Wentong","year":"2024","unstructured":"Wentong Li, Yuqian Yuan, Jian Liu, Dongqi Tang, Song Wang, Jie Qin, Jianke Zhu, and Lei Zhang. 2024d. Tokenpacker: Efficient visual projector for multimodal llm. arXiv preprint arXiv:2407.02392 (2024)."},{"key":"e_1_3_2_1_33_1","volume-title":"Blobctrl: A unified and flexible framework for element-level image generation and editing. arXiv preprint arXiv:2503.13434","author":"Li Yaowei","year":"2025","unstructured":"Yaowei Li, Lingen Li, Zhaoyang Zhang, Xiaoyu Li, Guangzhi Wang, Hongxiang Li, Xiaodong Cun, Ying Shan, and Yuexian Zou. 2025b. Blobctrl: A unified and flexible framework for element-level image generation and editing. arXiv preprint arXiv:2503.13434 (2025)."},{"key":"e_1_3_2_1_34_1","volume-title":"Unifiedmllm: Enabling unified representation for multi-modal multi-tasks with large language model. arXiv preprint arXiv:2408.02503","author":"Li Zhaowei","year":"2024","unstructured":"Zhaowei Li, Wei Wang, YiQing Cai, Xu Qi, Pengyu Wang, Dong Zhang, Hang Song, Botian Jiang, Zhida Huang, and Tao Wang. 2024b. Unifiedmllm: Enabling unified representation for multi-modal multi-tasks with large language model. arXiv preprint arXiv:2408.02503 (2024)."},{"key":"e_1_3_2_1_35_1","volume-title":"VisualCloze: A Universal Image Generation Framework via Visual In-Context Learning. arXiv preprint arXiv:2504.07960","author":"Li Zhong-Yu","year":"2025","unstructured":"Zhong-Yu Li, Ruoyi Du, Juncheng Yan, Le Zhuo, Zhen Li, Peng Gao, Zhanyu Ma, and Ming-Ming Cheng. 2025a. VisualCloze: A Universal Image Generation Framework via Visual In-Context Learning. arXiv preprint arXiv:2504.07960 (2025)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01728"},{"key":"e_1_3_2_1_37_1","volume-title":"PixWizard: Versatile image-to-image visual assistant with open-language instructions. arXiv preprint arXiv:2409.15278","author":"Lin Weifeng","year":"2024","unstructured":"Weifeng Lin, Xinyu Wei, Renrui Zhang, Le Zhuo, Shitian Zhao, Siyuan Huang, Junlin Xie, Yu Qiao, Peng Gao, and Hongsheng Li. 2024. PixWizard: Versatile image-to-image visual assistant with open-language instructions. arXiv preprint arXiv:2409.15278 (2024)."},{"key":"e_1_3_2_1_38_1","volume-title":"RealGeneral: Unifying Visual Generation via Temporal In-Context Learning with Video Models. arXiv preprint arXiv:2503.10406","author":"Lin Yijing","year":"2025","unstructured":"Yijing Lin, Mengqi Huang, Shuhan Zhuang, and Zhendong Mao. 2025. RealGeneral: Unifying Visual Generation via Temporal In-Context Learning with Video Models. arXiv preprint arXiv:2503.10406 (2025)."},{"key":"e_1_3_2_1_39_1","volume-title":"Kosmos-g: Generating images in context with multimodal large language models. arXiv preprint arXiv:2310.02992","author":"Pan Xichen","year":"2023","unstructured":"Xichen Pan, Li Dong, Shaohan Huang, Zhiliang Peng, Wenhu Chen, and Furu Wei. 2023. Kosmos-g: Generating images in context with multimodal large language models. arXiv preprint arXiv:2310.02992 (2023)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_41_1","volume-title":"Dreambench: A human-aligned benchmark for personalized image generation. arXiv preprint arXiv:2406.16855","author":"Peng Yuang","year":"2024","unstructured":"Yuang Peng, Yuxin Cui, Haomiao Tang, Zekun Qi, Runpei Dong, Jing Bai, Chunrui Han, Zheng Ge, Xiangyu Zhang, and Shu-Tao Xia. 2024. Dreambench: A human-aligned benchmark for personalized image generation. arXiv preprint arXiv:2406.16855 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"International conference on machine learning. PMLR, 8748-8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al., 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748-8763."},{"key":"e_1_3_2_1_43_1","unstructured":"Tianhe Ren Shilong Liu Ailing Zeng Jing Lin Kunchang Li He Cao Jiayu Chen Xinyu Huang Yukang Chen Feng Yan et al. 2024. Grounded sam: Assembling open-world models for diverse visual tasks. arXiv preprint arXiv:2401.14159 (2024)."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00847"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"e_1_3_2_1_48_1","volume-title":"Ominicontrol: Minimal and universal control for diffusion transformer. arXiv preprint arXiv:2411.15098","author":"Tan Zhenxiong","year":"2024","unstructured":"Zhenxiong Tan, Songhua Liu, Xingyi Yang, Qiaochu Xue, and Xinchao Wang. 2024. Ominicontrol: Minimal and universal control for diffusion transformer. arXiv preprint arXiv:2411.15098, Vol. 3 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"Do We Need to Design Specific Diffusion Models for Different Tasks? Try ONE-PIC. arXiv preprint arXiv:2412.05619","author":"Tao Ming","year":"2024","unstructured":"Ming Tao, Bing-Kun Bao, Yaowei Wang, and Changsheng Xu. 2024. Do We Need to Design Specific Diffusion Models for Different Tasks? Try ONE-PIC. arXiv preprint arXiv:2412.05619 (2024)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i8.32904"},{"key":"e_1_3_2_1_51_1","volume-title":"Omnigen: Unified image generation. arXiv preprint arXiv:2409.11340","author":"Xiao Shitao","year":"2024","unstructured":"Shitao Xiao, Yueze Wang, Junjie Zhou, Huaying Yuan, Xingrun Xing, Ruiran Yan, Shuting Wang, Tiejun Huang, and Zheng Liu. 2024. Omnigen: Unified image generation. arXiv preprint arXiv:2409.11340 (2024)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"e_1_3_2_1_53_1","volume-title":"DreamMix: Decoupling Object Attributes for Enhanced Editability in Customized Image Inpainting. arXiv preprint arXiv:2411.17223","author":"Yang Yicheng","year":"2024","unstructured":"Yicheng Yang, Pengxiang Li, Lu Zhang, Liqian Ma, Ping Hu, Siyu Du, Yunzhi Zhuge, Xu Jia, and Huchuan Lu. 2024. DreamMix: Decoupling Object Attributes for Enhanced Editability in Customized Image Inpainting. arXiv preprint arXiv:2411.17223 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721","author":"Ye Hu","year":"2023","unstructured":"Hu Ye, Jun Zhang, Sibo Liu, Xiao Han, and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721 (2023)."},{"key":"e_1_3_2_1_55_1","volume-title":"AnyEdit: Mastering Unified High-Quality Image Editing for Any Idea. arXiv preprint arXiv:2411.15738","author":"Yu Qifan","year":"2024","unstructured":"Qifan Yu, Wei Chow, Zhongqi Yue, Kaihang Pan, Yang Wu, Xiaoyang Wan, Juncheng Li, Siliang Tang, Hanwang Zhang, and Yueting Zhuang. 2024. AnyEdit: Mastering Unified High-Quality Image Editing for Any Idea. arXiv preprint arXiv:2411.15738 (2024)."},{"key":"e_1_3_2_1_56_1","volume-title":"Advances in Neural Information Processing Systems","volume":"36","author":"Zhang Kai","year":"2024","unstructured":"Kai Zhang, Lingbo Mo, Wenhu Chen, Huan Sun, and Yu Su. 2024. Magicbrush: A manually annotated dataset for instruction-guided image editing. Advances in Neural Information Processing Systems, Vol. 36 (2024)."},{"key":"e_1_3_2_1_57_1","volume-title":"UltraEdit: Instruction-based Fine-Grained Image Editing at Scale. arXiv preprint arXiv:2407.05282","author":"Zhao Haozhe","year":"2024","unstructured":"Haozhe Zhao, Xiaojian Ma, Liang Chen, Shuzheng Si, Rujie Wu, Kaikai An, Peiyu Yu, Minjia Zhang, Qing Li, and Baobao Chang. 2024. UltraEdit: Instruction-based Fine-Grained Image Editing at Scale. arXiv preprint arXiv:2407.05282 (2024)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755811","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:15:57Z","timestamp":1765340157000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755811"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":57,"alternative-id":["10.1145\/3746027.3755811","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755811","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}