{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T15:55:28Z","timestamp":1775318128185,"version":"3.50.1"},"reference-count":48,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,6,1]],"date-time":"2026-06-01T00:00:00Z","timestamp":1780272000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["KKGD202403095"],"award-info":[{"award-number":["KKGD202403095"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Digital Signal Processing"],"published-print":{"date-parts":[[2026,6]]},"DOI":"10.1016\/j.dsp.2026.106030","type":"journal-article","created":{"date-parts":[[2026,3,2]],"date-time":"2026-03-02T20:36:54Z","timestamp":1772483814000},"page":"106030","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["PoseFA: Pose-aligned diffusion for realistic human image generation"],"prefix":"10.1016","volume":"176","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6245-2673","authenticated-orcid":false,"given":"Meng","family":"Wang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7115-2189","authenticated-orcid":false,"given":"Bing","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7331-5596","authenticated-orcid":false,"given":"Huiling","family":"Chen","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.dsp.2026.106030_bib0001","series-title":"Advances in Neural Information Processing Systems","first-page":"405","article-title":"Pose guided person image generation","volume":"30","author":"Ma","year":"2017"},{"issue":"11","key":"10.1016\/j.dsp.2026.106030_bib0002","doi-asserted-by":"crossref","first-page":"139","DOI":"10.1145\/3422622","article-title":"Generative adversarial networks","volume":"63","author":"Goodfellow","year":"2020","journal-title":"Commun. ACM"},{"key":"10.1016\/j.dsp.2026.106030_bib0003","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.dsp.2026.106030_bib0004","unstructured":"J. Song, C. Meng, S. Ermon, Denoising diffusion implicit models, (2020). arXiv preprint arXiv: 2010.02502."},{"key":"10.1016\/j.dsp.2026.106030_bib0005","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"13535","article-title":"Neural texture extraction and distribution for controllable person image synthesis","author":"Ren","year":"2022"},{"key":"10.1016\/j.dsp.2026.106030_bib0006","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5968","article-title":"Person image synthesis via denoising diffusion model","author":"Bhunia","year":"2023"},{"key":"10.1016\/j.dsp.2026.106030_bib0007","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"6420","article-title":"Coarse-to-fine latent diffusion for pose-guided person image synthesis","author":"Lu","year":"2024"},{"key":"10.1016\/j.dsp.2026.106030_bib0008","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"4173","article-title":"UPGPT: universal diffusion model for person image generation, editing and pose transfer","author":"Cheong","year":"2023"},{"key":"10.1016\/j.dsp.2026.106030_bib0009","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"15988","article-title":"HumanSD: a native skeleton-guided diffusion model for human image generation","author":"Ju","year":"2023"},{"key":"10.1016\/j.dsp.2026.106030_bib0010","series-title":"2024 IEEE International Conference on Multimedia and Expo (ICME)","first-page":"1","article-title":"DNAF: diffusion with noise-aware feature for pose-guided person image synthesis","author":"Guo","year":"2024"},{"key":"10.1016\/j.dsp.2026.106030_bib0011","series-title":"ICASSP 2025-2025 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)","first-page":"1","article-title":"One-shot learning for pose-guided person image synthesis in the wild","author":"Fan","year":"2025"},{"key":"10.1016\/j.dsp.2026.106030_bib0012","first-page":"5998","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.dsp.2026.106030_bib0013","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"3836","article-title":"Adding conditional control to text-to-image diffusion models","author":"Zhang","year":"2023"},{"key":"10.1016\/j.dsp.2026.106030_bib0014","unstructured":"A. Gu, T. Dao, Mamba: linear-time sequence modeling with selective state spaces, (2023). arXiv preprint arXiv: 2312.00752."},{"key":"10.1016\/j.dsp.2026.106030_bib0015","unstructured":"D.P. Kingma, M. Welling, Auto-encoding variational bayes, (2013). arXiv preprint arXiv: 1312.6114."},{"key":"10.1016\/j.dsp.2026.106030_bib0016","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"5084","article-title":"Controllable person image synthesis with attribute-decomposed gan","author":"Men","year":"2020"},{"key":"10.1016\/j.dsp.2026.106030_bib0017","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7690","article-title":"Deep image spatial transformation for person image generation","author":"Ren","year":"2020"},{"key":"10.1016\/j.dsp.2026.106030_bib0018","doi-asserted-by":"crossref","first-page":"9584","DOI":"10.1109\/TIP.2020.3029455","article-title":"PoNA: pose-guided non-local attention for human pose transfer","volume":"29","author":"Li","year":"2020","journal-title":"IEEE Trans. Image Process."},{"key":"10.1016\/j.dsp.2026.106030_bib0019","doi-asserted-by":"crossref","DOI":"10.1016\/j.patcog.2025.111626","article-title":"FaTNET: feature-alignment transformer network for human pose transfer","volume":"165","author":"Luo","year":"2025","journal-title":"Pattern Recognit."},{"issue":"7","key":"10.1016\/j.dsp.2026.106030_bib0020","doi-asserted-by":"crossref","first-page":"4635","DOI":"10.1007\/s00371-024-03447-7","article-title":"Few-shot anime pose transfer","volume":"40","author":"Wang","year":"2024","journal-title":"Vis. Comput."},{"key":"10.1016\/j.dsp.2026.106030_bib0021","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10684","article-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach","year":"2022"},{"key":"10.1016\/j.dsp.2026.106030_bib0022","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume":"35","author":"Saharia","year":"2022","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.dsp.2026.106030_bib0023","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"5646","article-title":"Disentangled pose and appearance guidance for multi-pose generation","author":"Xiao","year":"2025"},{"key":"10.1016\/j.dsp.2026.106030_bib0024","unstructured":"F. Shen, H. Ye, J. Zhang, C. Wang, X. Han, W. Yang, Advancing pose-guided image synthesis with progressive conditional diffusion models, (2023). arXiv preprint arXiv: 2310.06313."},{"key":"10.1016\/j.dsp.2026.106030_bib0025","unstructured":"B. Qin, W. Ye, Q. Yu, S. Tang, Y. Zhuang, Dancing Avatar: pose and text-guided human videos synthesis with image diffusion model, arXiv preprint arXiv: 2308.07749(2023)."},{"key":"10.1016\/j.dsp.2026.106030_bib0026","first-page":"65670","article-title":"Stable-pose: leveraging transformers for pose-guided text-to-image generation","volume":"37","author":"Wang","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.dsp.2026.106030_bib0027","unstructured":"A. Dosovitskiy, L. Beyer, A. Kolesnikov, D. Weissenborn, X. Zhai, T. Unterthiner, M. Dehghani, M. Minderer, G. Heigold, S. Gelly, et al., An image is worth 16x16 words: transformers for image recognition at scale, (2020). arXiv preprint arXiv: 2010.11929."},{"key":"10.1016\/j.dsp.2026.106030_bib0028","series-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","first-page":"27805","article-title":"UniPose: a unified multimodal framework for human pose comprehension, generation and editing","author":"Li","year":"2025"},{"key":"10.1016\/j.dsp.2026.106030_bib0029","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"10012","article-title":"Swin transformer: hierarchical vision transformer using shifted windows","author":"Liu","year":"2021"},{"key":"10.1016\/j.dsp.2026.106030_bib0030","first-page":"2226","article-title":"Improved techniques for training gans","volume":"29","author":"Salimans","year":"2016","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.dsp.2026.106030_bib0031","series-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision","first-page":"22768","article-title":"Controllable person image synthesis with pose-constrained latent diffusion","author":"Han","year":"2023"},{"key":"10.1016\/j.dsp.2026.106030_bib0032","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"1096","article-title":"DeepFashion: powering robust clothes recognition and retrieval with rich annotations","author":"Liu","year":"2016"},{"key":"10.1016\/j.dsp.2026.106030_bib0033","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"2347","article-title":"Progressive pose attention transfer for person image generation","author":"Zhu","year":"2019"},{"key":"10.1016\/j.dsp.2026.106030_bib0034","series-title":"Proceedings of the IEEE International Conference on Computer Vision","first-page":"1116","article-title":"Scalable person re-identification: a benchmark","author":"Zheng","year":"2015"},{"key":"10.1016\/j.dsp.2026.106030_bib0035","first-page":"6626","article-title":"GAns trained by a two time-scale update rule converge to a local nash equilibrium","volume":"30","author":"Heusel","year":"2017","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.dsp.2026.106030_bib0036","series-title":"Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition","first-page":"586","article-title":"The unreasonable effectiveness of deep features as a perceptual metric","author":"Zhang","year":"2018"},{"issue":"4","key":"10.1016\/j.dsp.2026.106030_bib0037","doi-asserted-by":"crossref","first-page":"600","DOI":"10.1109\/TIP.2003.819861","article-title":"Image quality assessment: from error visibility to structural similarity","volume":"13","author":"Wang","year":"2004","journal-title":"IEEE Trans. Image Process."},{"issue":"3","key":"10.1016\/j.dsp.2026.106030_bib0038","first-page":"64","article-title":"Markov processes over denumerable products of spaces, describing large systems of automata","volume":"5","author":"Vaserstein","year":"1969","journal-title":"Problemy Peredachi Informatsii"},{"key":"10.1016\/j.dsp.2026.106030_bib0039","first-page":"8024","article-title":"Pytorch: an imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"10.1016\/j.dsp.2026.106030_bib0040","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"10806","article-title":"Learning semantic person image generation by region-adaptive normalization","author":"Lv","year":"2021"},{"key":"10.1016\/j.dsp.2026.106030_bib0041","series-title":"European Conference on Computer Vision","first-page":"161","article-title":"Cross attention based style distribution for controllable person image synthesis","author":"Zhou","year":"2022"},{"key":"10.1016\/j.dsp.2026.106030_bib0042","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7713","article-title":"Exploring dual-task correlation for pose guided person image generation","author":"Zhang","year":"2022"},{"key":"10.1016\/j.dsp.2026.106030_bib0043","unstructured":"K.D.P.B.J. Adam, et al., A method for stochastic optimization, 1412 (6) (2014). arXiv preprint arXiv: 1412.6980."},{"key":"10.1016\/j.dsp.2026.106030_bib0044","series-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","first-page":"7982","article-title":"PISE: person image synthesis and editing with decoupled gan","author":"Zhang","year":"2021"},{"key":"10.1016\/j.dsp.2026.106030_bib0045","series-title":"European Conference on Computer Vision","first-page":"717","article-title":"Xinggan for person image generation","author":"Tang","year":"2020"},{"key":"10.1016\/j.dsp.2026.106030_bib0046","series-title":"International Conference on Machine Learning","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","author":"Radford","year":"2021"},{"key":"10.1016\/j.dsp.2026.106030_bib0047","series-title":"International Conference on Machine Learning","first-page":"5156","article-title":"Transformers are RNNs: fast autoregressive transformers with linear attention","author":"Katharopoulos","year":"2020"},{"key":"10.1016\/j.dsp.2026.106030_bib0048","first-page":"127181","article-title":"Demystify mamba in vision: a linear attention perspective","volume":"37","author":"Han","year":"2024","journal-title":"Adv. Neural Inf. Process. Syst."}],"container-title":["Digital Signal Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1051200426001491?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S1051200426001491?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T14:59:00Z","timestamp":1775314740000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S1051200426001491"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6]]},"references-count":48,"alternative-id":["S1051200426001491"],"URL":"https:\/\/doi.org\/10.1016\/j.dsp.2026.106030","relation":{},"ISSN":["1051-2004"],"issn-type":[{"value":"1051-2004","type":"print"}],"subject":[],"published":{"date-parts":[[2026,6]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"PoseFA: Pose-aligned diffusion for realistic human image generation","name":"articletitle","label":"Article Title"},{"value":"Digital Signal Processing","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.dsp.2026.106030","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Inc. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"106030"}}