{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T05:16:41Z","timestamp":1781587001786,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Hong Kong Research Grants Council","award":["CityU 11208123"],"award-info":[{"award-number":["CityU 11208123"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612200","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T07:27:30Z","timestamp":1698391650000},"page":"3190-3199","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":66,"title":["Uni-paint: A Unified Framework for Multimodal Image Inpainting with Pretrained Diffusion Model"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8213-5803","authenticated-orcid":false,"given":"Shiyuan","family":"Yang","sequence":"first","affiliation":[{"name":"City University of Hong Kong, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1624-2680","authenticated-orcid":false,"given":"Xiaodong","family":"Chen","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7014-5377","authenticated-orcid":false,"given":"Jing","family":"Liao","sequence":"additional","affiliation":[{"name":"City University of Hong Kong, Hong Kong SAR, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592450"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01767"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/1531326.1531330"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592116"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2004.833105"},{"key":"e_1_3_2_1_7_1","volume-title":"An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618","author":"Gal Rinon","year":"2022","unstructured":"Rinon Gal, Yuval Alaluf, Yuval Atzmon, Or Patashnik, Amit H Bermano, Gal Chechik, and Daniel Cohen-Or. 2022. An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)."},{"key":"e_1_3_2_1_8_1","volume-title":"SVDiff: Compact Parameter Space for Diffusion Fine-Tuning. arXiv preprint arXiv:2303.11305","author":"Han Ligong","year":"2023","unstructured":"Ligong Han, Yinxiao Li, Han Zhang, Peyman Milanfar, Dimitris Metaxas, and Feng Yang. 2023. SVDiff: Compact Parameter Space for Diffusion Fine-Tuning. arXiv preprint arXiv:2303.11305 (2023)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2014.2330611"},{"key":"e_1_3_2_1_10_1","volume-title":"Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626","author":"Hertz Amir","year":"2022","unstructured":"Amir Hertz, Ron Mokady, Jay Tenenbaum, Kfir Aberman, Yael Pritch, and Daniel Cohen-Or. 2022. Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)."},{"key":"e_1_3_2_1_11_1","volume-title":"Ronan Le Bras, and Yejin Choi","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. 2021. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)."},{"key":"e_1_3_2_1_12_1","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in Neural Information Processing Systems, Vol. 33 (2020), 6840--6851.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_14_1","volume-title":"Shape-Guided Diffusion with Inside-Outside Attention. arXiv e-prints","author":"Park Dong Huk","year":"2022","unstructured":"Dong Huk Park, Grace Luo, Clayton Toste, Samaneh Azadi, Xihui Liu, Maka Karalashvili, Anna Rohrbach, and Trevor Darrell. 2022. Shape-Guided Diffusion with Inside-Outside Attention. arXiv e-prints (2022), arXiv--2212."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00246"},{"key":"e_1_3_2_1_18_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00606"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00778"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_6"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00037"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"e_1_3_2_1_25_1","volume-title":"Sdedit: Image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073","author":"Meng Chenlin","year":"2021","unstructured":"Chenlin Meng, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan Zhu, and Stefano Ermon. 2021. Sdedit: Image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073 (2021)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"e_1_3_2_1_27_1","volume-title":"Edgeconnect: Generative image inpainting with adversarial edge learning. arXiv preprint arXiv:1901.00212","author":"Nazeri Kamyar","year":"2019","unstructured":"Kamyar Nazeri, Eric Ng, Tony Joseph, Faisal Z Qureshi, and Mehran Ebrahimi. 2019. Edgeconnect: Generative image inpainting with adversarial edge learning. arXiv preprint arXiv:1901.00212 (2019)."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.278"},{"key":"e_1_3_2_1_30_1","volume-title":"International conference on machine learning. PMLR, 8748--8763","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et al. 2021. Learning transferable visual models from natural language supervision. In International conference on machine learning. PMLR, 8748--8763."},{"key":"e_1_3_2_1_31_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530757"},{"key":"e_1_3_2_1_35_1","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia Chitwan","year":"2022","unstructured":"Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily L Denton, Kamyar Ghasemipour, Raphael Gontijo Lopes, Burcu Karagol Ayan, Tim Salimans, et al. 2022b. Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, Vol. 35 (2022), 36479--36494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_36_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_1"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00323"},{"key":"e_1_3_2_1_39_1","volume-title":"NIMA: Neural image assessment","author":"Talebi Hossein","year":"2018","unstructured":"Hossein Talebi and Peyman Milanfar. 2018. NIMA: Neural image assessment. IEEE transactions on image processing, Vol. 27, 8 (2018), 3998--4011."},{"key":"e_1_3_2_1_40_1","volume-title":"Unitune: Text-driven image editing by fine tuning an image generation model on a single image. arXiv preprint arXiv:2210.09477","author":"Valevski Dani","year":"2022","unstructured":"Dani Valevski, Matan Kalman, Yossi Matias, and Yaniv Leviathan. 2022. Unitune: Text-driven image editing by fine tuning an image generation model on a single image. arXiv preprint arXiv:2210.09477 (2022)."},{"key":"e_1_3_2_1_41_1","volume-title":"Sketch-Guided Text-to-Image Diffusion Models. arXiv preprint arXiv:2211.13752","author":"Voynov Andrey","year":"2022","unstructured":"Andrey Voynov, Kfir Aberman, and Daniel Cohen-Or. 2022. Sketch-Guided Text-to-Image Diffusion Models. arXiv preprint arXiv:2211.13752 (2022)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00465"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01761"},{"key":"e_1_3_2_1_44_1","volume-title":"Elite: Encoding visual concepts into textual embeddings for customized text-to-image generation. arXiv preprint arXiv:2302.13848","author":"Wei Yuxiang","year":"2023","unstructured":"Yuxiang Wei, Yabo Zhang, Zhilong Ji, Jinfeng Bai, Lei Zhang, and Wangmeng Zuo. 2023. Elite: Encoding visual concepts into textual embeddings for customized text-to-image generation. arXiv preprint arXiv:2302.13848 (2023)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00599"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01763"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00787"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00577"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00457"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240625"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612200","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612200","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:06:53Z","timestamp":1755821213000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612200"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":51,"alternative-id":["10.1145\/3581783.3612200","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612200","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}