{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,2]],"date-time":"2025-11-02T18:35:09Z","timestamp":1762108509191,"version":"build-2065373602"},"publisher-location":"Cham","reference-count":38,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031731150"},{"type":"electronic","value":"9783031731167"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73116-7_9","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:15:38Z","timestamp":1730301338000},"page":"144-160","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["LayerDiff: Exploring Text-Guided Multi-layered Composable Image Synthesis via\u00a0Layer-Collaborative Diffusion Model"],"prefix":"10.1007","author":[{"given":"Runhui","family":"Huang","sequence":"first","affiliation":[]},{"given":"Kaixin","family":"Cai","sequence":"additional","affiliation":[]},{"given":"Jianhua","family":"Han","sequence":"additional","affiliation":[]},{"given":"Xiaodan","family":"Liang","sequence":"additional","affiliation":[]},{"given":"Renjing","family":"Pei","sequence":"additional","affiliation":[]},{"given":"Guansong","family":"Lu","sequence":"additional","affiliation":[]},{"given":"Songcen","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Wei","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Hang","family":"Xu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"9_CR1","unstructured":"Birkl, R., Wofk, D., M\u00fcller, M.: MiDaS v3.1 \u2013 a model zoo for robust monocular relative depth estimation. arXiv preprint arXiv:2307.14460 (2023)"},{"key":"9_CR2","unstructured":"Brock, A., Donahue, J., Simonyan, K.: Large scale GAN training for high fidelity natural image synthesis. arXiv preprint arXiv:1809.11096 (2018)"},{"key":"9_CR3","doi-asserted-by":"publisher","first-page":"679","DOI":"10.1109\/TPAMI.1986.4767851","volume":"6","author":"J Canny","year":"1986","unstructured":"Canny, J.: A computational approach to edge detection. IEEE Trans. Pattern Anal. Mach. Intell. 6, 679\u2013698 (1986)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR4","doi-asserted-by":"publisher","first-page":"172","DOI":"10.1109\/TPAMI.2019.2929257","volume":"43","author":"Z Cao","year":"2019","unstructured":"Cao, Z., Hidalgo Martinez, G., Simon, T., Wei, S., Sheikh, Y.A.: OpenPose: realtime multi-person 2D pose estimation using part affinity fields. IEEE Trans. Pattern Anal. Mach. Intell. 43, 172\u2013186 (2019)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"9_CR5","unstructured":"Dai, W., et al.: InstructBLIP: towards general-purpose vision-language models with instruction tuning. 
arXiv preprint arXiv:2305.06500 (2023)"},{"key":"9_CR6","unstructured":"Gal, R., et al.: An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)"},{"issue":"11","key":"9_CR7","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"9_CR8","doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10696\u201310706 (2022)","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"9_CR9","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-Or, D.: Prompt-to-prompt image editing with cross attention control. arXiv preprint arXiv:2208.01626 (2022)"},{"key":"9_CR10","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Le\u00a0Bras, R., Choi, Y.: CLIPscore: a reference-free evaluation metric for image captioning. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 7514\u20137528 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"9_CR11","unstructured":"Ho, J., Chen, X., Srinivas, A., Duan, Y., Abbeel, P.: Flow++: improving flow-based generative models with variational dequantization and architecture design. In: International Conference on Machine Learning, pp. 2722\u20132730. PMLR (2019)"},{"key":"9_CR12","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"9_CR13","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"9_CR14","doi-asserted-by":"crossref","unstructured":"Hong, S., Lee, G., Jang, W., Kim, S.: Improving sample quality of diffusion models using self-attention guidance. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7462\u20137471 (2023)","DOI":"10.1109\/ICCV51070.2023.00686"},{"key":"9_CR15","doi-asserted-by":"publisher","unstructured":"Honnibal, M., Montani, I., Van\u00a0Landeghem, S., Boyd, A.: spaCy: industrial-strength natural language processing in Python (2020). https:\/\/doi.org\/10.5281\/zenodo.1212303","DOI":"10.5281\/zenodo.1212303"},{"key":"9_CR16","unstructured":"Huang, H., He, R., Sun, Z., Tan, T., et\u00a0al.: IntroVAE: introspective variational autoencoders for photographic image synthesis. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"9_CR17","doi-asserted-by":"crossref","unstructured":"Kang, M., et al.: Scaling up GANs for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10124\u201310134 (2023)","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"9_CR18","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et al.: Segment anything (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"9_CR19","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"9_CR20","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. 
arXiv preprint arXiv:2112.10741 (2021)"},{"key":"9_CR21","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: ICML, pp. 8162\u20138171. PMLR (2021)"},{"key":"9_CR22","doi-asserted-by":"publisher","unstructured":"Obukhov, A., Seitzer, M., Wu, P.W., Zhydenko, S., Kyl, J., Lin, E.Y.J.: High-fidelity performance metrics for generative models in pytorch (2020). https:\/\/doi.org\/10.5281\/zenodo.4957738, https:\/\/github.com\/toshas\/torch-fidelity, version: 0.3.0","DOI":"10.5281\/zenodo.4957738"},{"key":"9_CR23","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"9_CR24","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"9_CR25","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"9_CR26","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"9_CR27","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487 (2022)"},{"key":"9_CR28","unstructured":"Schuhmann, C., et al.: LAION-400M: open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"9_CR29","doi-asserted-by":"crossref","unstructured":"Shao, S., et al.: Objects365: a large-scale, high-quality dataset for object detection. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 8430\u20138439 (2019)","DOI":"10.1109\/ICCV.2019.00852"},{"key":"9_CR30","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"9_CR31","doi-asserted-by":"crossref","unstructured":"Tumanyan, N., Geyer, M., Bagon, S., Dekel, T.: Plug-and-play diffusion features for text-driven image-to-image translation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1921\u20131930 (2023)","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"9_CR32","unstructured":"Wu, C., et al.: NUWA-infinity: autoregressive over autoregressive generation for infinite visual synthesis. arXiv preprint arXiv:2207.09814 (2022)"},{"key":"9_CR33","doi-asserted-by":"crossref","unstructured":"Xue, H., Huang, Z., Sun, Q., Song, L., Zhang, W.: Freestyle layout-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 14256\u201314266 (June 2023)","DOI":"10.1109\/CVPR52729.2023.01370"},{"key":"9_CR34","unstructured":"Yao, L., et al.: DetCLIP: dictionary-enriched visual-concept paralleled pre-training for open-world detection. In: NeurIPS (2022)"},{"key":"9_CR35","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. 
arXiv preprint arXiv:2206.107892(3), 5 (2022)"},{"key":"9_CR36","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"9_CR37","unstructured":"Zhang, X., Zhao, W., Lu, X., Chien, J.: Text2layer: layered image generation using latent diffusion model. arXiv preprint arXiv:2307.09781 (2023)"},{"key":"9_CR38","doi-asserted-by":"crossref","unstructured":"Zheng, G., Zhou, X., Li, X., Qi, Z., Shan, Y., Li, X.: LayoutDiffusion: controllable diffusion model for layout-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22490\u201322499 (2023)","DOI":"10.1109\/CVPR52729.2023.02154"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73116-7_9","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:21:33Z","timestamp":1730301693000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73116-7_9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031731150","9783031731167"],"references-count":38,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73116-7_9","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}
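For context, the payload above is a single "work" envelope as returned by the Crossref REST API. The sketch below is an illustrative addition, not part of the record: it shows one way to fetch and sanity-check such a record using only the Python standard library. The endpoint https://api.crossref.org/works/{DOI} is Crossref's public works route; the client name and mailto contact in the User-Agent are hypothetical placeholders used to follow Crossref's polite-pool etiquette.

import json
import urllib.request

DOI = "10.1007/978-3-031-73116-7_9"  # taken from the record's "DOI" field above

# Hypothetical client identification; Crossref asks politely-behaved clients
# to include a contact address in the User-Agent.
req = urllib.request.Request(
    f"https://api.crossref.org/works/{DOI}",
    headers={"User-Agent": "example-client/0.1 (mailto:you@example.org)"},
)

with urllib.request.urlopen(req) as resp:
    envelope = json.load(resp)

# The envelope wraps the bibliographic record in "message",
# exactly as in the payload shown above.
assert envelope["status"] == "ok"
assert envelope["message-type"] == "work"

work = envelope["message"]
print(work["title"][0])                # chapter title
print(len(work.get("reference", [])))  # 38, matching "references-count"

Parsing the envelope rather than the bare message is deliberate: "status" and "message-type" distinguish a single work from list responses, whose "message" holds an "items" array instead.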