{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T07:21:01Z","timestamp":1781335261997,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681199","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:49Z","timestamp":1729925989000},"page":"3342-3351","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["LoMOE: Localized Multi-Object Editing via Multi-Diffusion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-3628-3649","authenticated-orcid":false,"given":"Goirik","family":"Chakrabarty","sequence":"first","affiliation":[{"name":"TCS Research, New Delhi, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4076-3427","authenticated-orcid":false,"given":"Aditya","family":"Chandrasekar","sequence":"additional","affiliation":[{"name":"IISc Bangalore &amp; TCS Reseach, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1186-6311","authenticated-orcid":false,"given":"Ramya","family":"Hebbalaguppe","sequence":"additional","affiliation":[{"name":"TCS Reseach &amp; IIT Delhi, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8699-5760","authenticated-orcid":false,"given":"Prathosh","family":"AP","sequence":"additional","affiliation":[{"name":"IISc Bangalore, Bengaluru, India"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"Alex Andonian Sabrina Osmany Audrey Cui YeonHwan Park Ali Jahanian Antonio Torralba and David Bau. 2021. Paint by Word. arXiv:arXiv:2103.10951"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592450"},{"key":"e_1_3_2_2_3_1","unstructured":"Omer Bar-Tal Lior Yariv Yaron Lipman and Tali Dekel. 2023. MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation. In ICML. PMLR."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00460"},{"key":"e_1_3_2_2_5_1","volume-title":"Ledits: Limitless image editing using text-to-image models. arXiv preprint arXiv:2311.16711","author":"Brack Manuel","year":"2023","unstructured":"Manuel Brack, Felix Friedrich, Katharina Kornmeier, Linoy Tsaban, Patrick Schramowski, Kristian Kersting, and Apolin\u00e1rio Passos. 2023. Ledits: Limitless image editing using text-to-image models. arXiv preprint arXiv:2311.16711 (2023)."},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Aditya Chandrasekar Goirik Chakrabarty Jai Bardhan Ramya Hebbalaguppe and Prathosh AP. 2024. ReMOVE: A Reference-free Metric for Object Erasure. arXiv preprint arXiv:2409.00707 (2024).","DOI":"10.1109\/CVPRW63382.2024.00788"},{"key":"e_1_3_2_2_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00821"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/34.400565"},{"key":"e_1_3_2_2_12_1","volume-title":"The Eleventh International Conference on Learning Representations. https:\/\/openreview. net\/forum?id=3lge0p5o-M-","author":"Couairon Guillaume","year":"2023","unstructured":"Guillaume Couairon, Jakob Verbeek, Holger Schwenk, and Matthieu Cord. 2023. DiffEdit: Diffusion-based semantic image editing with mask guidance. In The Eleventh International Conference on Learning Representations. https:\/\/openreview. net\/forum?id=3lge0p5o-M-"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_6"},{"key":"e_1_3_2_2_14_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34 (2021), 8780--8794."},{"key":"e_1_3_2_2_15_1","first-page":"16222","article-title":"Diffusion self-guidance for controllable image generation","volume":"36","author":"Epstein Dave","year":"2023","unstructured":"Dave Epstein, Allan Jabri, Ben Poole, Alexei Efros, and Aleksander Holynski. 2023. Diffusion self-guidance for controllable image generation. Advances in Neural Information Processing Systems 36 (2023), 16222--16239.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530164"},{"key":"e_1_3_2_2_18_1","volume-title":"Generative adversarial nets. Advances in neural information processing systems 27","author":"Goodfellow Ian","year":"2014","unstructured":"Ian Goodfellow, Jean Pouget-Abadie, Mehdi Mirza, Bing Xu, David Warde-Farley, Sherjil Ozair, Aaron Courville, and Yoshua Bengio. 2014. Generative adversarial nets. Advances in neural information processing systems 27 (2014)."},{"key":"e_1_3_2_2_19_1","volume-title":"Localized Text-to-Image Generation for Free via Cross Attention Control. arXiv preprint arXiv:2306.14636","author":"He Yutong","year":"2023","unstructured":"Yutong He, Ruslan Salakhutdinov, and J Zico Kolter. 2023. Localized Text-to-Image Generation for Free via Cross Attention Control. arXiv preprint arXiv:2306.14636 (2023)."},{"key":"e_1_3_2_2_20_1","unstructured":"Amir Hertz Ron Mokady Jay Tenenbaum Kfir Aberman Yael Pritch and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. (2022)."},{"key":"e_1_3_2_2_21_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_2_22_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems 33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020), 6840--6851."},{"key":"e_1_3_2_2_23_1","volume-title":"Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications. https:\/\/openreview.net\/forum?id=qw8AKxfYbI","author":"Ho Jonathan","year":"2021","unstructured":"Jonathan Ho and Tim Salimans. 2021. Classifier-Free Diffusion Guidance. In NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications. https:\/\/openreview.net\/forum?id=qw8AKxfYbI"},{"key":"e_1_3_2_2_24_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Ju Xuan","year":"2024","unstructured":"Xuan Ju, Ailing Zeng, Yuxuan Bian, Shaoteng Liu, and Qiang Xu. 2024. PnP Inversion: Boosting Diffusion-based Editing with 3 Lines of Code. International Conference on Learning Representations (ICLR) (2024)."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00708"},{"key":"e_1_3_2_2_27_1","volume-title":"Auto-Encoding Variational Bayes. In International Conference on Learning Representations, ICLR","author":"Diederik","year":"2014","unstructured":"Diederik P. Kingma and Max Welling. 2014. Auto-Encoding Variational Bayes. In International Conference on Learning Representations, ICLR 2014, Yoshua Bengio and Yann LeCun (Eds.)."},{"key":"e_1_3_2_2_28_1","volume-title":"arXiv:2304.02643","author":"Kirillov Alexander","year":"2023","unstructured":"Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alexander C. Berg, Wan-Yen Lo, Piotr Doll\u00e1r, and Ross Girshick. 2023. Segment Anything. arXiv:2304.02643 (2023)."},{"key":"e_1_3_2_2_29_1","volume-title":"BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML.","author":"Li Junnan","year":"2022","unstructured":"Junnan Li, Dongxu Li, Caiming Xiong, and Steven Hoi. 2022. BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation. In ICML."},{"key":"e_1_3_2_2_30_1","volume-title":"GLIGEN: Open-Set Grounded Textto- Image Generation. CVPR","author":"Li Yuheng","year":"2023","unstructured":"Yuheng Li, Haotian Liu, Qingyang Wu, Fangzhou Mu, Jianwei Yang, Jianfeng Gao, Chunyuan Li, and Yong Jae Lee. 2023. GLIGEN: Open-Set Grounded Textto- Image Generation. CVPR (2023)."},{"key":"e_1_3_2_2_31_1","volume-title":"Proceedings, Part V 13","author":"Lin Tsung-Yi","year":"2014","unstructured":"Tsung-Yi Lin, Michael Maire, Serge Belongie, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C Lawrence Zitnick. 2014. Microsoft coco: Common objects in context. In Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6--12, 2014, Proceedings, Part V 13. Springer, 740-- 755."},{"key":"e_1_3_2_2_32_1","volume-title":"Jimmy Lei Ba, and Ruslan Salakhutdinov","author":"Mansimov Elman","year":"2015","unstructured":"Elman Mansimov, Emilio Parisotto, Jimmy Lei Ba, and Ruslan Salakhutdinov. 2015. Generating images from captions with attention. arXiv preprint arXiv:1511.02793 (2015)."},{"key":"e_1_3_2_2_33_1","volume-title":"Sdedit: Guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073","author":"Meng Chenlin","year":"2021","unstructured":"Chenlin Meng, Yutong He, Yang Song, Jiaming Song, JiajunWu, Jun-Yan Zhu, and Stefano Ermon. 2021. Sdedit: Guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073 (2021)."},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_2_35_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning. PMLR, 16784--16804","author":"Nichol Alexander Quinn","year":"2022","unstructured":"Alexander Quinn Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob Mcgrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning. PMLR, 16784--16804."},{"key":"e_1_3_2_2_36_1","volume-title":"GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning. PMLR, 16784--16804","author":"Nichol Alexander Quinn","year":"2022","unstructured":"Alexander Quinn Nichol, Prafulla Dhariwal, Aditya Ramesh, Pranav Shyam, Pamela Mishkin, Bob Mcgrew, Ilya Sutskever, and Mark Chen. 2022. GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models. In International Conference on Machine Learning. PMLR, 16784--16804."},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_20"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591513"},{"key":"e_1_3_2_2_39_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_2_40_1","volume-title":"International Conference on Machine Learning. PMLR, 8821--8831","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. 2021. Zero-shot text-to-image generation. In International Conference on Machine Learning. PMLR, 8821--8831."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_2_42_1","volume-title":"Denoising Diffusion Implicit Models. In International Conference on Learning Representations. https: \/\/openreview.net\/forum?id=St1giarCHLP","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In International Conference on Learning Representations. https: \/\/openreview.net\/forum?id=St1giarCHLP"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"e_1_3_2_2_44_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I. Guyon, U. Von Luxburg, S. Bengio, H.Wallach, R. Fergus, S. Vishwanathan, and R. Garnett (Eds.), Vol. 30. Curran Associates, Inc. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/ 2017\/file\/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01761"},{"key":"e_1_3_2_2_46_1","volume-title":"Image quality assessment: from error visibility to structural similarity","author":"Wang Zhou","year":"2004","unstructured":"Zhou Wang, Alan C Bovik, Hamid R Sheikh, and Eero P Simoncelli. 2004. Image quality assessment: from error visibility to structural similarity. IEEE transactions on image processing 13, 4 (2004), 600--612."},{"key":"e_1_3_2_2_47_1","volume-title":"Clip-gen: Language-free training of a text-to-image generator with clip. arXiv preprint arXiv:2203.00386","author":"Wang Zihao","year":"2022","unstructured":"Zihao Wang, Wei Liu, Qian He, Xinglong Wu, and Zili Yi. 2022. Clip-gen: Language-free training of a text-to-image generator with clip. arXiv preprint arXiv:2203.00386 (2022)."},{"key":"e_1_3_2_2_48_1","volume-title":"Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341","author":"Wu Xiaoshi","year":"2023","unstructured":"Xiaoshi Wu, Yiming Hao, Keqiang Sun, Yixiong Chen, Feng Zhu, Rui Zhao, and Hongsheng Li. 2023. Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)."},{"key":"e_1_3_2_2_49_1","volume-title":"Imagereward: Learning and evaluating human preferences for text-to-image generation. Advances in Neural Information Processing Systems 36","author":"Xu Jiazheng","year":"2024","unstructured":"Jiazheng Xu, Xiao Liu, Yuchen Wu, Yuxuan Tong, Qinkai Li, Ming Ding, Jie Tang, and Yuxiao Dong. 2024. Imagereward: Learning and evaluating human preferences for text-to-image generation. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681199","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681199","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:02Z","timestamp":1750295882000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681199"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":51,"alternative-id":["10.1145\/3664647.3681199","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681199","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}