{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:33:26Z","timestamp":1777865606605,"version":"3.51.4"},"reference-count":47,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,1]],"date-time":"2026-03-01T00:00:00Z","timestamp":1772323200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100007129","name":"Natural Science Foundation of Shandong Province","doi-asserted-by":"publisher","award":["ZR2022MF320"],"award-info":[{"award-number":["ZR2022MF320"]}],"id":[{"id":"10.13039\/501100007129","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Image and Vision Computing"],"published-print":{"date-parts":[[2026,3]]},"DOI":"10.1016\/j.imavis.2026.105923","type":"journal-article","created":{"date-parts":[[2026,1,29]],"date-time":"2026-01-29T07:36:25Z","timestamp":1769672185000},"page":"105923","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["Object-level semantic alignment for enhancing fidelity in text-to-image generation with diffusion models"],"prefix":"10.1016","volume":"167","author":[{"given":"Wenna","family":"Liu","sequence":"first","affiliation":[]},{"given":"Na","family":"Tian","sequence":"additional","affiliation":[]},{"given":"Youjia","family":"Shao","sequence":"additional","affiliation":[]},{"given":"Wencang","family":"Zhao","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.imavis.2026.105923_b1","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2024.128965","article-title":"Content-aware preserving image generation","volume":"617","author":"Le","year":"2025","journal-title":"Neurocomputing"},{"key":"10.1016\/j.imavis.2026.105923_b2","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2024.105138","article-title":"Localization of diffusion model-based inpainting through the inter-intra similarity of frequency features","volume":"148","author":"Lee","year":"2024","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105923_b3","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2024.105097","article-title":"Enhancing consistency in virtual try-on: A novel diffusion-based approach","volume":"148","author":"Zhou","year":"2024","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105923_b4","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2024.105254","article-title":"Diff-STAR: Exploring student-teacher adaptive reconstruction through diffusion-based generation for image harmonization","volume":"151","author":"Cao","year":"2024","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105923_b5","doi-asserted-by":"crossref","DOI":"10.1016\/j.neunet.2024.106629","article-title":"Mask-shift-inference: A novel paradigm for domain generalization","volume":"179","author":"Shao","year":"2024","journal-title":"Neural Netw."},{"key":"10.1016\/j.imavis.2026.105923_b6","doi-asserted-by":"crossref","DOI":"10.1016\/j.asoc.2025.112910","article-title":"CaRGI: Causal semantic representation learning via generative intervention for single domain generalization","author":"Shao","year":"2025","journal-title":"Appl. Soft Comput."},{"key":"10.1016\/j.imavis.2026.105923_b7","article-title":"Learning discriminative topological structure information representation for 2D shape and social network classification via persistent homology","author":"Wang","year":"2025","journal-title":"Knowl.-Based Syst."},{"key":"10.1016\/j.imavis.2026.105923_b8","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105923_b9","series-title":"Pseudo numerical methods for diffusion models on manifolds","author":"Liu","year":"2022"},{"key":"10.1016\/j.imavis.2026.105923_b10","first-page":"5775","article-title":"Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps","volume":"35","author":"Lu","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105923_b11","series-title":"Denoising diffusion implicit models","author":"Song","year":"2020"},{"key":"10.1016\/j.imavis.2026.105923_b12","article-title":"Generative adversarial nets","volume":"27","author":"Goodfellow","year":"2014","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105923_b13","doi-asserted-by":"crossref","unstructured":"R. Abdal, Y. Qin, P. Wonka, Image2stylegan: How to embed images into the stylegan latent space?, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2019, pp. 4432\u20134441.","DOI":"10.1109\/ICCV.2019.00453"},{"key":"10.1016\/j.imavis.2026.105923_b14","first-page":"8780","article-title":"Diffusion models beat gans on image synthesis","volume":"34","author":"Dhariwal","year":"2021","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105923_b15","series-title":"Hierarchical text-conditional image generation with clip latents","first-page":"3","author":"Ramesh","year":"2022"},{"key":"10.1016\/j.imavis.2026.105923_b16","doi-asserted-by":"crossref","DOI":"10.1016\/j.imavis.2024.105296","article-title":"Parameter efficient finetuning of text-to-image models with trainable self-attention layer","volume":"151","author":"Li","year":"2024","journal-title":"Image Vis. Comput."},{"key":"10.1016\/j.imavis.2026.105923_b17","series-title":"International Conference on Machine Learning","first-page":"8162","article-title":"Improved denoising diffusion probabilistic models","author":"Nichol","year":"2021"},{"key":"10.1016\/j.imavis.2026.105923_b18","series-title":"Muse: Text-to-image generation via masked generative transformers","author":"Chang","year":"2023"},{"key":"10.1016\/j.imavis.2026.105923_b19","doi-asserted-by":"crossref","unstructured":"R. Rombach, A. Blattmann, D. Lorenz, P. Esser, B. Ommer, High-resolution image synthesis with latent diffusion models, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2022, pp. 10684\u201310695.","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"10.1016\/j.imavis.2026.105923_b20","series-title":"Divide & bind your attention for improved generative semantic nursing","author":"Li","year":"2023"},{"key":"10.1016\/j.imavis.2026.105923_b21","series-title":"International Conference on Machine Learning","first-page":"8821","article-title":"Zero-shot text-to-image generation","author":"Ramesh","year":"2021"},{"key":"10.1016\/j.imavis.2026.105923_b22","series-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models","author":"Nichol","year":"2021"},{"key":"10.1016\/j.imavis.2026.105923_b23","series-title":"An efficient encoder-decoder architecture with top-down attention for speech separation","author":"Li","year":"2022"},{"key":"10.1016\/j.imavis.2026.105923_b24","series-title":"Training-free structured diffusion guidance for compositional text-to-image synthesis","author":"Feng","year":"2022"},{"issue":"4","key":"10.1016\/j.imavis.2026.105923_b25","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3592116","article-title":"Attend-and-excite: Attention-based semantic guidance for text-to-image diffusion models","volume":"42","author":"Chefer","year":"2023","journal-title":"ACM Trans. Graph."},{"key":"10.1016\/j.imavis.2026.105923_b26","article-title":"Linguistic binding in diffusion models: Enhancing attribute correspondence through attention map alignment","volume":"36","author":"Rassin","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105923_b27","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105923_b28","series-title":"International Conference on Machine Learning","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","author":"Sohl-Dickstein","year":"2015"},{"key":"10.1016\/j.imavis.2026.105923_b29","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"issue":"4","key":"10.1016\/j.imavis.2026.105923_b30","doi-asserted-by":"crossref","first-page":"307","DOI":"10.1561\/2200000056","article-title":"An introduction to variational autoencoders","volume":"12","author":"Kingma","year":"2019","journal-title":"Found. Trends\u00ae Mach. Learn."},{"key":"10.1016\/j.imavis.2026.105923_b31","first-page":"19667","article-title":"NVAE: A deep hierarchical variational autoencoder","volume":"33","author":"Vahdat","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105923_b32","doi-asserted-by":"crossref","unstructured":"T. Karras, S. Laine, M. Aittala, J. Hellsten, J. Lehtinen, T. Aila, Analyzing and improving the image quality of stylegan, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2020, pp. 8110\u20138119.","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"10.1016\/j.imavis.2026.105923_b33","doi-asserted-by":"crossref","unstructured":"R. Abdal, P. Zhu, J. Femiani, N. Mitra, P. Wonka, Clip2stylegan: Unsupervised extraction of stylegan edit directions, in: ACM SIGGRAPH 2022 Conference Proceedings, 2022, pp. 1\u20139.","DOI":"10.1145\/3528233.3530747"},{"key":"10.1016\/j.imavis.2026.105923_b34","series-title":"Hierarchical text-conditional image generation with clip latents","first-page":"3","author":"Ramesh","year":"2022"},{"key":"10.1016\/j.imavis.2026.105923_b35","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.imavis.2026.105923_b36","series-title":"Sdxl: Improving latent diffusion models for high-resolution image synthesis","author":"Podell","year":"2023"},{"issue":"3","key":"10.1016\/j.imavis.2026.105923_b37","first-page":"8","article-title":"Improving image generation with better captions","volume":"2","author":"Betker","year":"2023","journal-title":"Comput. Sci."},{"key":"10.1016\/j.imavis.2026.105923_b38","doi-asserted-by":"crossref","unstructured":"N. Ruiz, Y. Li, V. Jampani, Y. Pritch, M. Rubinstein, K. Aberman, Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation, in: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, 2023, pp. 22500\u201322510.","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"10.1016\/j.imavis.2026.105923_b39","series-title":"International Conference on Machine Learning","first-page":"1060","article-title":"Generative adversarial text to image synthesis","author":"Reed","year":"2016"},{"key":"10.1016\/j.imavis.2026.105923_b40","doi-asserted-by":"crossref","unstructured":"H. Zhang, T. Xu, H. Li, S. Zhang, X. Wang, X. Huang, D.N. Metaxas, Stackgan: Text to photo-realistic image synthesis with stacked generative adversarial networks, in: Proceedings of the IEEE International Conference on Computer Vision, 2017, pp. 5907\u20135915.","DOI":"10.1109\/ICCV.2017.629"},{"key":"10.1016\/j.imavis.2026.105923_b41","doi-asserted-by":"crossref","unstructured":"T. Xu, P. Zhang, Q. Huang, H. Zhang, Z. Gan, X. Huang, X. He, Attngan: Fine-grained text to image generation with attentional generative adversarial networks, in: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2018, pp. 1316\u20131324.","DOI":"10.1109\/CVPR.2018.00143"},{"key":"10.1016\/j.imavis.2026.105923_b42","series-title":"Prompt-to-prompt image editing with cross attention control","author":"Hertz","year":"2022"},{"issue":"1","key":"10.1016\/j.imavis.2026.105923_b43","first-page":"411","article-title":"spaCy 2: Natural language understanding with bloom embeddings, convolutional neural networks and incremental parsing","volume":"7","author":"Honnibal","year":"2017"},{"key":"10.1016\/j.imavis.2026.105923_b44","series-title":"Clipscore: A reference-free evaluation metric for image captioning","author":"Hessel","year":"2021"},{"key":"10.1016\/j.imavis.2026.105923_b45","doi-asserted-by":"crossref","unstructured":"Y. Hu, B. Liu, J. Kasai, Y. Wang, M. Ostendorf, R. Krishna, N.A. Smith, Tifa: Accurate and interpretable text-to-image faithfulness evaluation with question answering, in: Proceedings of the IEEE\/CVF International Conference on Computer Vision, 2023, pp. 20406\u201320417.","DOI":"10.1109\/ICCV51070.2023.01866"},{"key":"10.1016\/j.imavis.2026.105923_b46","series-title":"Proceedings of the AAAI Conference on Artificial Intelligence","first-page":"2555","article-title":"Exploring clip for assessing the look and feel of images","volume":"vol. 37","author":"Wang","year":"2023"},{"key":"10.1016\/j.imavis.2026.105923_b47","series-title":"Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13","first-page":"740","article-title":"Microsoft coco: Common objects in context","author":"Lin","year":"2014"}],"container-title":["Image and Vision Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000296?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0262885626000296?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T07:00:44Z","timestamp":1777532444000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0262885626000296"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3]]},"references-count":47,"alternative-id":["S0262885626000296"],"URL":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105923","relation":{},"ISSN":["0262-8856"],"issn-type":[{"value":"0262-8856","type":"print"}],"subject":[],"published":{"date-parts":[[2026,3]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Object-level semantic alignment for enhancing fidelity in text-to-image generation with diffusion models","name":"articletitle","label":"Article Title"},{"value":"Image and Vision Computing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.imavis.2026.105923","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"105923"}}