{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T07:11:05Z","timestamp":1778051465609,"version":"3.51.4"},"reference-count":41,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,6]]},"DOI":"10.1109\/wacv61042.2026.00316","type":"proceedings-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T19:59:32Z","timestamp":1778011172000},"page":"3233-3242","source":"Crossref","is-referenced-by-count":0,"title":["PSDiffusion: Harmonized Multi-Layer Image Generation via Layout and Appearance Alignment"],"prefix":"10.1109","author":[{"given":"Dingbang","family":"Huang","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wenbo","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifei","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xinyu","family":"Pan","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yanhong","family":"Zeng","sequence":"additional","affiliation":[{"name":"Shanghai AI Laboratory"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bo","family":"Dai","sequence":"additional","affiliation":[{"name":"The University of Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00608"},{"key":"ref2","article-title":"Qwen2.5-vl technical report","author":"Bai","year":"2025"},{"issue":"3","key":"ref3","first-page":"8","volume-title":"Computer Science","volume":"2","author":"Betker","year":"2023"},{"key":"ref4","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02132"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"ref7","article-title":"Layerfusion: Harmonized multi-layer text-to-image generation with generative priors","author":"Dalva","year":"2024"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0714"},{"key":"ref9","article-title":"Scaling rectified flow transformers for high-resolution image synthesis","volume-title":"Forty-first international Conference on machine learning","author":"Esser"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1392"},{"key":"ref11","article-title":"Prompt-to-prompt image editing with cross-attention control","volume-title":"The Eleventh International Conference on Learning Representations","author":"Hertz"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"ref13","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","volume":"30","author":"Heusel","year":"2017","journal-title":"Advances in neural information processing systems"},{"issue":"2","key":"ref14","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022","journal-title":"ICLR"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i8.28696"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73113-6_23"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73116-7_9"},{"key":"ref18","article-title":"Layeringdiff: Layered image synthesis via generation, then disassembly with generative knowledge","author":"Kang","year":"2025"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref20","volume-title":"Flux","year":"2024"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01541-0"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00184"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"ref27","article-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis","author":"Podell","year":"2023"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00745"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-92808-6_3"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2643"},{"key":"ref32","article-title":"Laion-400m: Open dataset of clip-filtered 400 million image-text pairs","author":"Schuhmann","year":"2021"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0068"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02115"},{"key":"ref35","article-title":"Wan: Open and advanced large-scale video generative models","author":"Wan","year":"2025"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00716"},{"key":"ref37","article-title":"Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models","author":"Ye","year":"2023","journal-title":"Computing Research Repository"},{"key":"ref38","article-title":"Inpaint anything: Segment anything meets image inpainting","author":"Yu","year":"2023"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3658150"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref41","article-title":"Text2layer: Layered image generation using latent diffusion model","author":"Zhang","year":"2023"}],"event":{"name":"2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2026,3,6]]},"end":{"date-parts":[[2026,3,10]]}},"container-title":["2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11491838\/11491925\/11492637.pdf?arnumber=11492637","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:12:47Z","timestamp":1778047967000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11492637\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":41,"URL":"https:\/\/doi.org\/10.1109\/wacv61042.2026.00316","relation":{},"subject":[],"published":{"date-parts":[[2026,3,6]]}}}