{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T20:52:00Z","timestamp":1758055920828,"version":"3.44.0"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T00:00:00Z","timestamp":1751846400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T00:00:00Z","timestamp":1751846400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Major Fundamental Research Project of Shandong, China","award":["ZR2024ZD08","ZR2024ZD08","ZR2024ZD08","ZR2024ZD08"],"award-info":[{"award-number":["ZR2024ZD08","ZR2024ZD08","ZR2024ZD08","ZR2024ZD08"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimedia Systems"],"published-print":{"date-parts":[[2025,8]]},"DOI":"10.1007\/s00530-025-01892-5","type":"journal-article","created":{"date-parts":[[2025,7,7]],"date-time":"2025-07-07T09:04:10Z","timestamp":1751879050000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Integrating global signals with fine-grained consistency for conditional image generation"],"prefix":"10.1007","volume":"31","author":[{"given":"Guoqiang","family":"Dang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongmei","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fucheng","family":"Cao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,7,7]]},"reference":[{"key":"1892_CR1","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat gans on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1892_CR2","doi-asserted-by":"crossref","unstructured":"Rombach, R., et al.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp.\u00a010684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"1892_CR3","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"issue":"9","key":"1892_CR4","doi-asserted-by":"publisher","first-page":"10850","DOI":"10.1109\/TPAMI.2023.3261988","volume":"45","author":"F-A Croitoru","year":"2023","unstructured":"Croitoru, F.-A., et al.: Diffusion models in vision: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 45(9), 10850\u201310869 (2023)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"1892_CR5","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: Laion-5b: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1892_CR6","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1892_CR7","unstructured":"Nichol, A., et al.: Glide: Towards photorealistic image generation and editing with text-guided diffusion models. In: arXiv preprint arXiv:2112.10741 (2021)"},{"key":"1892_CR8","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp.\u00a03836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"1892_CR9","unstructured":"Qin, C., et al.: Unicontrol: a unified diffusion model for controllable visual generation in the wild. In: arXiv preprint arXiv:2305.11147 (2023)"},{"key":"1892_CR10","doi-asserted-by":"crossref","unstructured":"Ghiasi, G., et al.: Multi-task self-training for learning general representations. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision. pp.\u00a08856\u20138865 (2021)","DOI":"10.1109\/ICCV48922.2021.00873"},{"key":"1892_CR11","doi-asserted-by":"crossref","unstructured":"Li, M., et al.: ControlNet++: improving conditional controls with efficient consistency feedback: project page: liming-ai. github. io\/ControlNet_Plus_Plus. In: European Conference on Computer Vision. Springer, pp.\u00a0129\u2013147 (2024)","DOI":"10.1007\/978-3-031-72667-5_8"},{"key":"1892_CR12","doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp.\u00a010696\u201310706 (2022)","DOI":"10.1109\/CVPR52688.2022.01043"},{"issue":"5","key":"1892_CR13","first-page":"4296","volume":"38","author":"C Mou","year":"2024","unstructured":"Mou, C., et al.: T2i-adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. Proc. AAAI Conf. Artif. Intell. 38(5), 4296\u20134304 (2024)","journal-title":"Proc. AAAI Conf. Artif. Intell."},{"key":"1892_CR14","doi-asserted-by":"crossref","unstructured":"Zheng, G., et al.: Layoutdiffusion: controllable diffusion model for layout-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp.\u00a022490\u201322499 (2023)","DOI":"10.1109\/CVPR52729.2023.02154"},{"key":"1892_CR15","first-page":"357","volume":"1","author":"C-FR Chen","year":"2021","unstructured":"Chen, C.-F.R., Fan, Q., Panda, R.: Crossvit: cross-attention multi-scale vision transformer for image classification. Proc. IEEE\/CVF Int. Conf. Comput. Vis. 1, 357\u2013366 (2021)","journal-title":"Proc. IEEE\/CVF Int. Conf. Comput. Vis."},{"key":"1892_CR16","doi-asserted-by":"crossref","unstructured":"Du, G., et al.: Medical image segmentation based on U-net: a review. J. Imaging Sci. Technol. 64(2) (2020)","DOI":"10.2352\/J.ImagingSci.Technol.2020.64.2.020508"},{"key":"1892_CR17","unstructured":"Huang, L., et al.: Composer: creative and controllable image synthesis with composable conditions. In: arXiv preprint arXiv:2302.09778 (2023)"},{"key":"1892_CR18","doi-asserted-by":"crossref","unstructured":"Mo, S., et al.: Freecontrol: training-free spatial control of any text-to-image diffusion model with any condition. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp.\u00a07465\u20137475 (2024)","DOI":"10.1109\/CVPR52733.2024.00713"},{"key":"1892_CR19","unstructured":"Mi, Z., et al.: I think, therefore I diffuse: enabling multimodal in-context reasoning in diffusion models. In: arXiv preprint arXiv:2502.10458 (2025)"},{"key":"1892_CR20","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning. Pmlr. pp.\u00a08821\u20138831 (2021)"},{"key":"1892_CR21","first-page":"11127","volume":"36","author":"S Zhao","year":"2023","unstructured":"Zhao, S., et al.: Uni-controlnet: all-in-one control to text-to-image diffusion models. Adv. Neural. Inf. Process. Syst. 36, 11127\u201311150 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1892_CR22","unstructured":"He, Q., et al.: DynamicControl: Adaptive Condition Selection for Improved Text-to-Image Generation. In: arXiv preprint arXiv:2412.03255 (2024)"},{"key":"1892_CR23","unstructured":"He, Q., et al.: DynamicControl: Adaptive Condition Selection for Improved Text-to-Image Generation. In: arXiv preprint arXiv:2412.03255 (2024)"},{"key":"1892_CR24","unstructured":"Wang, H., et al.: UniCombine: Unified Multi-Conditional Combination with Diffusion Transformer. In: arXiv preprint arXiv:2503.09277 (2025)"},{"issue":"240","key":"1892_CR25","first-page":"1","volume":"24","author":"A Chowdhery","year":"2023","unstructured":"Chowdhery, A., et al.: Palm: scaling language modeling with pathways. J. Mach. Learn. Res. 24(240), 1\u2013113 (2023)","journal-title":"J. Mach. Learn. Res."},{"key":"1892_CR26","first-page":"27730","volume":"35","author":"L Ouyang","year":"2022","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. Adv. Neural. Inf. Process. Syst. 35, 27730\u201327744 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1892_CR27","first-page":"36652","volume":"36","author":"Y Kirstain","year":"2023","unstructured":"Kirstain, Y., et al.: Pick-a-pic: an open dataset of user preferences for text-to-image generation. Adv. Neural. Inf. Process. Syst. 36, 36652\u201336663 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"1892_CR28","unstructured":"Prabhudesai, M., et al.: Aligning text-to-image diffusion models with reward backpropagation. In: arXiv preprint arXiv:2310.03739 (2023)"},{"key":"1892_CR29","unstructured":"Wu, X., et al.: Human preference score v2: a solid benchmark for evaluating human preferences of text-to-image synthesis. In: arXiv preprint arXiv:2306.09341 (2023)"},{"key":"1892_CR30","unstructured":"Shazeer, N., et al.: The sparsely-gated mixture-of-experts layer. In: Outrageously large neural networks (2017)"},{"key":"1892_CR31","unstructured":"Von Oswald, J., et al.: Continual learning with hypernetworks. In: arXiv preprint arXiv:1906.00695 (2019)"},{"key":"1892_CR32","unstructured":"Ramesh, A., et al.: Hierarchical text-conditional image generation with clip latents. In: arXiv preprint arxiv:2204.06125 1.2, p. 3 (2022)"},{"key":"1892_CR33","unstructured":"Mao, A., Mohri, M., Zhong, Y.: Cross-entropy loss functions: theoretical analysis and applications. In: International Conference on Machine Learning. PMLR. pp.\u00a023803\u201323828 (2023)"},{"key":"1892_CR34","doi-asserted-by":"crossref","unstructured":"Ren, J., et al.: Balanced mse for imbalanced visual regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. pp.\u00a07926\u20137935 (2022)","DOI":"10.1109\/CVPR52688.2022.00777"},{"key":"1892_CR35","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: arXiv preprint arXiv:2010.02502 (2020)"},{"key":"1892_CR36","unstructured":"Loshchilov, I.: Decoupled weight decay regularization. In: arXiv preprint arXiv:1711.05101 (2017)"},{"key":"1892_CR37","unstructured":"Paszke, A., et al.: Pytorch: an imperative style, high-performance deep learning library. In: Advances in Neural Information Processing Systems 32 (2019)"},{"key":"1892_CR38","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., et al.: Microsoft coco: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13. Springer, pp.\u00a0740\u2013755 (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1892_CR39","doi-asserted-by":"crossref","unstructured":"Zhang, R., et al.: The unreasonable effectiveness of deep features as a perceptual metric. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. pp.\u00a0586\u2013595 (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"1892_CR40","unstructured":"Heusel, M., et al.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. In: Advances in Neural Information Processing Systems 30 (2017)"}],"container-title":["Multimedia Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01892-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00530-025-01892-5\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00530-025-01892-5.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T09:04:12Z","timestamp":1757927052000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00530-025-01892-5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,7]]},"references-count":40,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,8]]}},"alternative-id":["1892"],"URL":"https:\/\/doi.org\/10.1007\/s00530-025-01892-5","relation":{},"ISSN":["0942-4962","1432-1882"],"issn-type":[{"type":"print","value":"0942-4962"},{"type":"electronic","value":"1432-1882"}],"subject":[],"published":{"date-parts":[[2025,7,7]]},"assertion":[{"value":"27 February 2025","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 June 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"7 July 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"302"}}