{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T03:23:24Z","timestamp":1781234604812,"version":"3.54.1"},"publisher-location":"Cham","reference-count":68,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726668","type":"print"},{"value":"9783031726675","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T00:00:00Z","timestamp":1727568000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T00:00:00Z","timestamp":1727568000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72667-5_8","type":"book-chapter","created":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T20:11:48Z","timestamp":1727554308000},"page":"129-147","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":42,"title":["ControlNet$$++$$: Improving Conditional Controls with\u00a0Efficient Consistency Feedback"],"prefix":"10.1007","author":[{"given":"Ming","family":"Li","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Taojiannan","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huafeng","family":"Kuang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jie","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhaoning","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xuefeng","family":"Xiao","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3957-7061","authenticated-orcid":false,"given":"Chen","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,9,29]]},"reference":[{"key":"8_CR1","unstructured":"Black, K., Janner, M., Du, Y., Kostrikov, I., Levine, S.: Training diffusion models with reinforcement learning (2023). arXiv preprint arXiv:2305.13301"},{"key":"8_CR2","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: InstructPix2Pix: learning to follow image editing instructions. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"8_CR3","doi-asserted-by":"crossref","unstructured":"Caesar, H., Uijlings, J., Ferrari, V.: COCO-stuff: thing and stuff classes in context. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00132"},{"key":"8_CR4","unstructured":"Chen, L.C., Papandreou, G., Schroff, F., Adam, H.: Rethinking atrous convolution for semantic image segmentation (2017). arXiv preprint arXiv:1706.05587"},{"key":"8_CR5","doi-asserted-by":"crossref","unstructured":"Chen, M., Laina, I., Vedaldi, A.: Training-free layout control with cross-attention guidance (2023). arXiv preprint arXiv:2304.03373","DOI":"10.1109\/WACV57701.2024.00526"},{"key":"8_CR6","unstructured":"Chen, T., Xu, B., Zhang, C., Guestrin, C.: Training deep nets with sublinear memory cost. arXiv (2016)"},{"key":"8_CR7","doi-asserted-by":"crossref","unstructured":"Cheng, B., Misra, I., Schwing, A.G., Kirillov, A., Girdhar, R.: Masked-attention mask transformer for universal image segmentation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.00135"},{"key":"8_CR8","unstructured":"Cheng, B., Schwing, A., Kirillov, A.: Per-pixel classification is not all you need for semantic segmentation. NeurIPS (2021)"},{"key":"8_CR9","unstructured":"Chowdhery, A., et\u00a0al.: PaLM: Scaling language modeling with pathways (2022). arXiv preprint arXiv:2204.02311"},{"key":"8_CR10","unstructured":"Clark, K., Vicol, P., Swersky, K., Fleet, D.J.: Directly fine-tuning diffusion models on differentiable rewards (2023). arXiv preprint arXiv:2309.17400"},{"key":"8_CR11","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. NeurIPS (2021)"},{"key":"8_CR12","unstructured":"Fan, Y., et al.: DPOK: Reinforcement learning for fine-tuning text-to-image diffusion models. NeurIPS (2023)"},{"key":"8_CR13","unstructured":"Gal, R., et al.: An image is worth one word: Personalizing text-to-image generation using textual inversion. In: ICLR (2023)"},{"key":"8_CR14","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"8_CR15","unstructured":"Hertz, A., Mokady, R., Tenenbaum, J., Aberman, K., Pritch, Y., Cohen-or, D.: Prompt-to-prompt image editing with cross-attention control. In: ICLR (2023)"},{"key":"8_CR16","doi-asserted-by":"crossref","unstructured":"Hertzmann, A., Jacobs, C.E., Oliver, N., Curless, B., Salesin, D.H.: Image analogies. In: SIGGRAPH (2001)","DOI":"10.1145\/383259.383295"},{"key":"8_CR17","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. NeurIPS (2020)"},{"key":"8_CR18","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance (2022). arXiv preprint arXiv:2207.12598"},{"key":"8_CR19","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. In: ICLR (2022)"},{"key":"8_CR20","unstructured":"Hu, M., et al.: Cocktail: Mixing multi-modality controls for text-conditional image generation. NeurIPS (2023)"},{"key":"8_CR21","unstructured":"Huang, L., Chen, D., Liu, Y., Shen, Y., Zhao, D., Zhou, J.: Composer: creative and controllable image synthesis with composable conditions. In: ICML (2015)"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Ju, X., Zeng, A., Zhao, C., Wang, J., Zhang, L., Xu, Q.: HumanSD: a native skeleton-guided diffusion model for human image generation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.01465"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"8_CR24","unstructured":"Kingma, D., Salimans, T., Poole, B., Ho, J.: Variational diffusion models. NeurIPS (2021)"},{"key":"8_CR25","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-Pic: An open dataset of user preferences for text-to-image generation (2023). arXiv preprint arXiv:2305.01569"},{"key":"8_CR26","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GLIGEN: open-set grounded text-to-image generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Long, J., Shelhamer, E., Darrell, T.: Fully convolutional networks for semantic segmentation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"8_CR28","unstructured":"Meng, C., et al.: SDedit: guided image synthesis and editing with stochastic differential equations. In: ICLR (2022)"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2I-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models (2023). arXiv preprint arXiv:2302.08453","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"8_CR30","unstructured":"Nichol, A.Q., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. In: ICML (2022)"},{"key":"8_CR31","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. NeurIPS (2022)"},{"key":"8_CR32","doi-asserted-by":"crossref","unstructured":"Parmar, G., Zhang, R., Zhu, J.Y.: On aliased resizing and surprising subtleties in GAN evaluation. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01112"},{"key":"8_CR33","unstructured":"Podell, D., et al.: SDXL: Improving latent diffusion models for high-resolution image synthesis (2023). arXiv preprint arXiv:2307.01952"},{"key":"8_CR34","unstructured":"Prabhudesai, M., Goyal, A., Pathak, D., Fragkiadaki, K.: Aligning text-to-image diffusion models with reward backpropagation (2023). arXiv preprint arXiv:2310.03739"},{"key":"8_CR35","unstructured":"Qin, C., et\u00a0al.: UniControl: A unified diffusion model for controllable visual generation in the wild. NeurIPS (2023)"},{"key":"8_CR36","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"8_CR37","unstructured":"Raffel, C., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR (2020)"},{"key":"8_CR38","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents (2022). arXiv preprint arXiv:2204.06125"},{"key":"8_CR39","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: ICML (2021)"},{"key":"8_CR40","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR41","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: Convolutional networks for biomedical image segmentation. In: MICCAI (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"8_CR43","doi-asserted-by":"crossref","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. NeurIPS (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"8_CR44","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., Chen, L.C.: MobileNetV2: inverted residuals and linear bottlenecks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00474"},{"key":"8_CR45","unstructured":"Schuhmann, C., et al.: LAION-5B: An open large-scale dataset for training next generation image-text models. ArXiv (2022)"},{"key":"8_CR46","unstructured":"Schuhmann, C., et al.: LAION-400M: Open dataset of clip-filtered 400 million image-text pairs. ArXiv (2021)"},{"key":"8_CR47","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: ICML (2015)"},{"key":"8_CR48","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: ICLR (2021)"},{"key":"8_CR49","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. In: ICLR (2021)"},{"key":"8_CR50","unstructured":"Touvron, H., et\u00a0al.: Llama 2: Open foundation and fine-tuned chat models (2023). arXiv preprint arXiv:2307.09288"},{"key":"8_CR51","doi-asserted-by":"crossref","unstructured":"Wang, X., Darrell, T., Rambhatla, S.S., Girdhar, R., Misra, I.: InstanceDiffusion: Instance-level control for image generation (2024)","DOI":"10.1109\/CVPR52733.2024.00596"},{"key":"8_CR52","unstructured":"Wu, X., et al.: Human preference score v2: A solid benchmark for evaluating human preferences of text-to-image synthesis (2023). arXiv preprint arXiv:2306.09341"},{"key":"8_CR53","doi-asserted-by":"crossref","unstructured":"Wu, X., Sun, K., Zhu, F., Zhao, R., Li, H.: Better aligning text-to-image models with human preference. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"8_CR54","doi-asserted-by":"crossref","unstructured":"Xiao, T., Liu, Y., Zhou, B., Jiang, Y., Sun, J.: Unified perceptual parsing for scene understanding. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01228-1_26"},{"key":"8_CR55","doi-asserted-by":"crossref","unstructured":"Xie, J., et al.: BoxDiff: text-to-image synthesis with training-free box-constrained diffusion. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00685"},{"key":"8_CR56","doi-asserted-by":"crossref","unstructured":"Xie, S., Tu, Z.: Holistically-nested edge detection. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.164"},{"key":"8_CR57","unstructured":"Xu, J., et al.: ImageReward: Learning and evaluating human preferences for text-to-image generation. NeurIPS (2023)"},{"key":"8_CR58","doi-asserted-by":"crossref","unstructured":"Yang, Z., et\u00a0al.: ReCo: region-controlled text-to-image generation. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01369"},{"key":"8_CR59","unstructured":"Ye, H., Zhang, J., Liu, S., Han, X., Yang, W.: IP-adapter: Text compatible image prompt adapter for text-to-image diffusion models (2023). arXiv preprint arXiv:2308.06721"},{"key":"8_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"8_CR61","unstructured":"Zhang, T., Zhang, Y., Vineet, V., Joshi, N., Wang, X.: Controllable text-to-image generation with GPT-4 (2023). arXiv preprint arXiv:2305.18583"},{"key":"8_CR62","unstructured":"Zhao, S., et al.: Uni-ControlNet: All-in-one control to text-to-image diffusion models. NeurIPS (2023)"},{"key":"8_CR63","unstructured":"Zhao, W., Bai, L., Rao, Y., Zhou, J., Lu, J.: UniPC: A unified predictor-corrector framework for fast sampling of diffusion models (2023). arXiv preprint arXiv:2302.04867"},{"key":"8_CR64","doi-asserted-by":"crossref","unstructured":"Zhou, B., Zhao, H., Puig, X., Fidler, S., Barriuso, A., Torralba, A.: Scene parsing through ADE20K dataset. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.544"},{"key":"8_CR65","doi-asserted-by":"crossref","unstructured":"Zhou, B., et al.: Semantic understanding of scenes through the ADE20K dataset. IJCV (2019)","DOI":"10.1007\/s11263-018-1140-0"},{"key":"8_CR66","doi-asserted-by":"crossref","unstructured":"Zhou, D., Li, Y., Ma, F., Yang, Z., Yang, Y.: MIGC: multi-instance generation controller for text-to-image synthesis. In: CVPR (2024)","DOI":"10.1109\/CVPR52733.2024.00651"},{"key":"8_CR67","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: MiniGPT-4: Enhancing vision-language understanding with advanced large language models (2023). arXiv preprint arXiv:2304.10592"},{"key":"8_CR68","doi-asserted-by":"crossref","unstructured":"Zhu, J.Y., Park, T., Isola, P., Efros, A.A.: Unpaired image-to-image translation using cycle-consistent adversarial networks. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.244"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72667-5_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,28]],"date-time":"2024-09-28T20:13:55Z","timestamp":1727554435000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72667-5_8"}},"subtitle":["Project Page: liming-ai.github.io\/ControlNet_Plus_Plus"],"short-title":[],"issued":{"date-parts":[[2024,9,29]]},"ISBN":["9783031726668","9783031726675"],"references-count":68,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72667-5_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,9,29]]},"assertion":[{"value":"29 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}