{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,17]],"date-time":"2026-07-17T06:02:43Z","timestamp":1784268163955,"version":"3.55.0"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","funder":[{"name":"NSFC","award":["62176008"],"award-info":[{"award-number":["62176008"]}]},{"name":"Tencent University Relations","award":["Tencent AI Lab RBFR2024006"],"award-info":[{"award-number":["Tencent AI Lab RBFR2024006"]}]},{"name":"Guangdong Provincial Key Laboratory of Ultra High Definition Immersive Media Technology","award":["2024B1212010006"],"award-info":[{"award-number":["2024B1212010006"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,15]]},"DOI":"10.1145\/3757377.3763897","type":"proceedings-article","created":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T16:30:41Z","timestamp":1765211441000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["BlobCtrl: Taming Controllable Blob for Element-level Image Editing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-7325-6439","authenticated-orcid":false,"given":"Yaowei","family":"Li","sequence":"first","affiliation":[{"name":"Peking University, School of Electronic and Computer Engineering, Shenzhen, China and Peking University Shenzhen Graduate School, Guangdong Provincial Key Laboratory of Ultra High Definition Immersive Media Technology, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1313-8717","authenticated-orcid":false,"given":"Lingen","family":"Li","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5583-6454","authenticated-orcid":false,"given":"Zhaoyang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2588-1687","authenticated-orcid":false,"given":"Xiaoyu","family":"Li","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8216-648X","authenticated-orcid":false,"given":"Guangzhi","family":"Wang","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7710-8835","authenticated-orcid":false,"given":"Hongxiang","family":"Li","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technolog, Hongkong, Hong Kong"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3607-2236","authenticated-orcid":false,"given":"Xiaodong","family":"Cun","sequence":"additional","affiliation":[{"name":"GVC Lab, Great Bay University, Dongguan, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7673-8325","authenticated-orcid":false,"given":"Ying","family":"Shan","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9999-6140","authenticated-orcid":false,"given":"Yuexian","family":"Zou","sequence":"additional","affiliation":[{"name":"Peking University, School of Electronic and Computer Engineering, Shenzhen, China and Peking University Shenzhen Graduate School, Guangdong Provincial Key Laboratory of Ultra High Definition Immersive Media Technology, Shenzhen, Guangdong, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,12,14]]},"reference":[{"key":"e_1_3_3_3_2_1","unstructured":"Hadi Alzayer Zhihao Xia Xuaner Zhang Eli Shechtman Jia-Bin Huang and Michael Gharbi. 2024. Magic Fixup: Streamlining Photo Editing by Watching Dynamic Videos. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.13044 (2024)."},{"key":"e_1_3_3_3_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687590"},{"key":"e_1_3_3_3_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_3_3_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"e_1_3_3_3_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"e_1_3_3_3_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-48762-X_63"},{"key":"e_1_3_3_3_8_1","doi-asserted-by":"crossref","unstructured":"Xi Chen Lianghua Huang Yu Liu Yujun Shen Deli Zhao and Hengshuang Zhao. 2023. AnyDoor: Zero-shot Object-level Image Customization. arXiv preprint (2023).","DOI":"10.1109\/CVPR52733.2024.00630"},{"key":"e_1_3_3_3_9_1","unstructured":"Zhe Chen Weiyun Wang Yue Cao Yangzhou Liu Zhangwei Gao Erfei Cui Jinguo Zhu Shenglong Ye Hao Tian Zhaoyang Liu et\u00a0al. 2024. Expanding performance boundaries of open-source multimodal models with model data and test-time scaling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.05271 (2024)."},{"key":"e_1_3_3_3_10_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_36"},{"key":"e_1_3_3_3_11_1","volume-title":"Forty-first International Conference on Machine Learning","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, et\u00a0al. 2024. Scaling rectified flow transformers for high-resolution image synthesis. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_3_3_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01212"},{"key":"e_1_3_3_3_13_1","unstructured":"Rinon Gal Yuval Alaluf Yuval Atzmon Or Patashnik Amit\u00a0H Bermano Gal Chechik and Daniel Cohen-Or. 2022. An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2208.01618 (2022)."},{"key":"e_1_3_3_3_14_1","volume-title":"ICLR","author":"Hertz Amir","year":"2023","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2023. Prompt-to-Prompt Image Editing with Cross-Attention Control. In ICLR."},{"key":"e_1_3_3_3_15_1","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. GANs trained by a two time-scale update rule converge to a local Nash equilibrium. Advances in Neural Information Processing Systems (NIPS) 30 (2017)."},{"key":"e_1_3_3_3_16_1","unstructured":"Edward\u00a0J Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2106.09685 (2021)."},{"key":"e_1_3_3_3_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00799"},{"key":"e_1_3_3_3_18_1","first-page":"150","volume-title":"European Conference on Computer Vision","author":"Ju Xuan","year":"2024","unstructured":"Xuan Ju, Xian Liu, Xintao Wang, Yuxuan Bian, Ying Shan, and Qiang Xu. 2024. Brushnet: A plug-and-play image inpainting model with decomposed dual-branch diffusion. In European Conference on Computer Vision. Springer, 150\u2013168."},{"key":"e_1_3_3_3_19_1","unstructured":"Diederik\u00a0P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1412.6980 (2014)."},{"key":"e_1_3_3_3_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"e_1_3_3_3_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_3_3_22_1","unstructured":"Black\u00a0Forest Labs. 2023. FLUX. https:\/\/github.com\/black-forest-labs\/flux."},{"key":"e_1_3_3_3_23_1","unstructured":"Yaowei Li Yuxuan Bian Xuan Ju Zhaoyang Zhang Ying Shan Yuexian Zou and Qiang Xu. 2024. BrushEdit: All-In-One Image Inpainting and Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.10316 (2024)."},{"key":"e_1_3_3_3_24_1","unstructured":"Yaowei Li Xiaoyu Li Zhaoyang Zhang Yuxuan Bian Gan Liu Xinyuan Li Jiale Xu Wenbo Hu Yating Liu Lingen Li et\u00a0al. 2025. IC-Custom: Diverse Image Customization via In-Context Learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2507.01926 (2025)."},{"key":"e_1_3_3_3_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"e_1_3_3_3_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"e_1_3_3_3_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687645"},{"key":"e_1_3_3_3_28_1","first-page":"231","volume-title":"European Conference on Computer Vision","author":"Lu Jingyi","year":"2024","unstructured":"Jingyi Lu, Xinghui Li, and Kai Han. 2024. Regiondrag: Fast region-based image editing with diffusion models. In European Conference on Computer Vision. Springer, 231\u2013246."},{"key":"e_1_3_3_3_29_1","unstructured":"PC Mahalanobis. 1936. On the generalized distance in Statistics. National Institute of Science of India."},{"key":"e_1_3_3_3_30_1","unstructured":"Chaojie Mao Jingfeng Zhang Yulin Pan Zeyinzi Jiang Zhen Han Yu Liu and Jingren Zhou. 2025. Ace++: Instruction-based image creation and editing via context-aware content filling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.02487 (2025)."},{"key":"e_1_3_3_3_31_1","unstructured":"Chong Mou Xintao Wang Jiechong Song Ying Shan and Jian Zhang. 2023. DragonDiffusion: Enabling Drag-style Manipulation on Diffusion Models. arxiv:https:\/\/arXiv.org\/abs\/2307.02421\u00a0[cs.CV]"},{"key":"e_1_3_3_3_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00811"},{"key":"e_1_3_3_3_33_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72627-9_3"},{"key":"e_1_3_3_3_34_1","volume-title":"Forty-first International Conference on Machine Learning","author":"Nie Weili","year":"2024","unstructured":"Weili Nie, Sifei Liu, Morteza Mardani, Chao Liu, Benjamin Eckart, and Arash Vahdat. 2024. Compositional Text-to-Image Generation with Dense Blob Representations. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_3_3_35_1","unstructured":"Maxime Oquab Timoth\u00e9e Darcet Theo Moutakanni Huy\u00a0V. Vo Marc Szafraniec Vasil Khalidov Pierre Fernandez Daniel Haziza Francisco Massa Alaaeldin El-Nouby Russell Howes Po-Yao Huang Hu Xu Vasu Sharma Shang-Wen Li Wojciech Galuba Mike Rabbat Mido Assran Nicolas Ballas Gabriel Synnaeve Ishan Misra Herve Jegou Julien Mairal Patrick Labatut Armand Joulin and Piotr Bojanowski. 2023. DINOv2: Learning Robust Visual Features without Supervision."},{"key":"e_1_3_3_3_36_1","first-page":"8748","volume-title":"International conference on machine learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748\u20138763."},{"key":"e_1_3_3_3_37_1","unstructured":"Aditya Ramesh Prafulla Dhariwal Alex Nichol Casey Chu and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2204.06125 1 2 (2022) 3."},{"key":"e_1_3_3_3_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_3_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_3_3_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00847"},{"key":"e_1_3_3_3_41_1","unstructured":"Yichun Shi Peng Wang and Weilin Huang. 2024. SeedEdit: Align Image Re-Generation to Image Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2411.06686 (2024)."},{"key":"e_1_3_3_3_42_1","unstructured":"Yujun Shi Chuhui Xue Jiachun Pan Wenqing Zhang Vincent\u00a0YF Tan and Song Bai. 2023. DragDiffusion: Harnessing Diffusion Models for Interactive Point-based Image Editing. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2306.14435 (2023)."},{"key":"e_1_3_3_3_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687668"},{"key":"e_1_3_3_3_44_1","unstructured":"Wensong Song Hong Jiang Zongxing Yang Ruijie Quan and Yi Yang. 2025. Insert anything: Image insertion via in-context editing in dit. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.15009 (2025)."},{"key":"e_1_3_3_3_45_1","unstructured":"Qixun Wang Xu Bai Haofan Wang Zekui Qin Anthony Chen Huaxia Li Xu Tang and Yao Hu. 2024. Instantid: Zero-shot identity-preserving generation in seconds. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2401.07519 (2024)."},{"key":"e_1_3_3_3_46_1","unstructured":"Qian Wang Yiqun Wang Michael Birsak and Peter Wonka. 2023. Blobgan-3d: A spatially-disentangled 3d-aware generative model for indoor scenes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.14706 (2023)."},{"key":"e_1_3_3_3_47_1","doi-asserted-by":"crossref","unstructured":"Zhou Wang Alan\u00a0C Bovik Hamid\u00a0R Sheikh and Eero\u00a0P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13 4 (2004) 600\u2013612.","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_3_48_1","unstructured":"Wikipedia contributors. 2024. Peak signal-to-noise ratio \u2014 Wikipedia The Free Encyclopedia. https:\/\/en.wikipedia.org\/w\/index.php?title=Peak_signal-to-noise_ratio&oldid=1210897995 [Online; accessed 4-March-2024]."},{"key":"e_1_3_3_3_49_1","first-page":"331","volume-title":"European Conference on Computer Vision","author":"Wu Weijia","year":"2024","unstructured":"Weijia Wu, Zhuang Li, Yuchao Gu, Rui Zhao, Yefei He, David\u00a0Junhao Zhang, Mike\u00a0Zheng Shou, Yan Li, Tingting Gao, and Di Zhang. 2024. Draganything: Motion control for anything using entity representation. In European Conference on Computer Vision. Springer, 331\u2013348."},{"key":"e_1_3_3_3_50_1","unstructured":"Zhexiao Xiong Wei Xiong Jing Shi He Zhang Yizhi Song and Nathan Jacobs. 2024. GroundingBooth: Grounding Text-to-Image Customization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.08520 (2024)."},{"key":"e_1_3_3_3_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"e_1_3_3_3_52_1","unstructured":"Hu Ye Jun Zhang Sibo Liu Xiao Han and Wei Yang. 2023. Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2308.06721 (2023)."},{"key":"e_1_3_3_3_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00406"},{"key":"e_1_3_3_3_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02433"},{"key":"e_1_3_3_3_55_1","unstructured":"Hui Zhang Dexiang Hong Tingwei Gao Yitong Wang Jie Shao Xinglong Wu Zuxuan Wu and Yu-Gang Jiang. 2024. CreatiLayout: Siamese Multimodal Diffusion Transformer for Creative Layout-to-Image Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.03859 (2024)."},{"key":"e_1_3_3_3_56_1","doi-asserted-by":"crossref","unstructured":"Lvmin Zhang Anyi Rao and Maneesh Agrawala. 2023b. Adding Conditional Control to Text-to-Image Diffusion Models.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_3_3_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_3_58_1","doi-asserted-by":"publisher","DOI":"10.1111\/cgf.14966"}],"event":{"name":"SA Conference Papers '25: SIGGRAPH Asia 2025 Conference Papers","location":"Hong Kong Hong Kong","acronym":"SA Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the SIGGRAPH Asia 2025 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3757377.3763897","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T03:26:01Z","timestamp":1765250761000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3757377.3763897"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,14]]},"references-count":57,"alternative-id":["10.1145\/3757377.3763897","10.1145\/3757377"],"URL":"https:\/\/doi.org\/10.1145\/3757377.3763897","relation":{},"subject":[],"published":{"date-parts":[[2025,12,14]]},"assertion":[{"value":"2025-12-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}