{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,4]],"date-time":"2025-09-04T13:22:40Z","timestamp":1756992160185,"version":"3.37.3"},"reference-count":96,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100004837","name":"Ministerio de Ciencia e Innovaci?n","doi-asserted-by":"publisher","award":["TED2021-131535B-I00"],"award-info":[{"award-number":["TED2021-131535B-I00"]}],"id":[{"id":"10.13039\/501100004837","id-type":"DOI","asserted-by":"publisher"}]},{"name":"European Union ?NextGenerationEU?"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2023]]},"DOI":"10.1109\/access.2023.3312017","type":"journal-article","created":{"date-parts":[[2023,9,4]],"date-time":"2023-09-04T18:13:18Z","timestamp":1693851198000},"page":"99111-99129","source":"Crossref","is-referenced-by-count":3,"title":["RGB-D-Fusion: Image Conditioned Depth Diffusion of Humanoid Subjects"],"prefix":"10.1109","volume":"11","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5578-7555","authenticated-orcid":false,"given":"Sascha","family":"Kirch","sequence":"first","affiliation":[{"name":"Electric and Computer Engineering Department, UNED-National University of Distance Education, Madrid, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9766-5057","authenticated-orcid":false,"given":"Valeria","family":"Olyunina","sequence":"additional","affiliation":[{"name":"Volograms Ltd., Dublin 8, Ireland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5409-1521","authenticated-orcid":false,"given":"Jan","family":"Ond\u0159ej","sequence":"additional","affiliation":[{"name":"Volograms Ltd., Dublin 8, Ireland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5691-9580","authenticated-orcid":false,"given":"Rafael","family":"Pag\u00e9s","sequence":"additional","affiliation":[{"name":"Volograms Ltd., Dublin 8, Ireland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4118-0234","authenticated-orcid":false,"given":"Sergio","family":"Mart\u00edn","sequence":"additional","affiliation":[{"name":"Electric and Computer Engineering Department, UNED-National University of Distance Education, Madrid, Spain"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8260-4155","authenticated-orcid":false,"given":"Clara","family":"P\u00e9rez-Molina","sequence":"additional","affiliation":[{"name":"Electric and Computer Engineering Department, UNED-National University of Distance Education, Madrid, Spain"}]}],"member":"263","reference":[{"key":"ref13","article-title":"Diffusion models beat GANs on image synthesis","author":"dhariwal","year":"2021","journal-title":"arXiv 2105 05233"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.12.089"},{"key":"ref12","article-title":"Improved denoising diffusion probabilistic models","author":"nichol","year":"2021","journal-title":"arXiv 2102 09672"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/s11431-020-1582-8"},{"key":"ref15","article-title":"Cascaded diffusion models for high fidelity image generation","author":"ho","year":"2021","journal-title":"arXiv 2106 15282"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref14","article-title":"Palette: Image-to-image diffusion models","author":"saharia","year":"2021","journal-title":"arXiv 2111 05826"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.3390\/s22145353"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01410"},{"key":"ref52","article-title":"Adding conditional control to text-to-image diffusion models","author":"zhang","year":"2023","journal-title":"arXiv 2302 05543"},{"key":"ref96","article-title":"A note on the evaluation of generative models","author":"theis","year":"2016","journal-title":"arXiv 1511 01844"},{"key":"ref11","article-title":"Denoising diffusion probabilistic models","author":"ho","year":"2020","journal-title":"arXiv 2006 11239"},{"key":"ref55","article-title":"Monocular depth estimation: A survey","author":"bhoi","year":"2019","journal-title":"arXiv 1901 09402"},{"key":"ref10","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","author":"sohl-dickstein","year":"2015","journal-title":"arXiv 1503 03585"},{"key":"ref54","article-title":"Classifier-free diffusion guidance","author":"ho","year":"2022","journal-title":"arXiv 2207 12598"},{"key":"ref17","article-title":"Video diffusion models","author":"ho","year":"2022","journal-title":"arXiv 2204 03458"},{"key":"ref16","article-title":"Image super-resolution via iterative refinement","author":"saharia","year":"2021","journal-title":"arXiv 2104 07636"},{"key":"ref19","article-title":"Diffusion probabilistic modeling for video generation","author":"yang","year":"2022","journal-title":"arXiv 2203 09481"},{"key":"ref18","article-title":"Flexible diffusion modeling of long videos","author":"harvey","year":"2022","journal-title":"arXiv 2205 11495"},{"key":"ref93","article-title":"Gaussian error linear units (GELUs)","author":"hendrycks","year":"2016","journal-title":"arXiv 1606 08415"},{"key":"ref92","article-title":"Layer normalization","author":"ba","year":"2016","journal-title":"arXiv 1607 06450"},{"key":"ref51","article-title":"A style-based generator architecture for generative adversarial networks","author":"karras","year":"2018","journal-title":"arXiv 1812 04948"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_39"},{"key":"ref50","article-title":"Label-efficient semantic segmentation with diffusion models","author":"baranchuk","year":"2021","journal-title":"arXiv 2112 03126"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"ref90","article-title":"Instance normalization: The missing ingredient for fast stylization","author":"ulyanov","year":"2016","journal-title":"arXiv 1607 08022"},{"key":"ref46","article-title":"Elucidating the design space of diffusion-based generative models","author":"karras","year":"2022","journal-title":"arXiv 2206 00364"},{"key":"ref45","article-title":"Improved techniques for training score-based generative models","author":"song","year":"2020","journal-title":"arXiv 2006 09011"},{"key":"ref89","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"arXiv 1502 03167"},{"key":"ref48","article-title":"SDEdit: Guided image synthesis and editing with stochastic differential equations","author":"meng","year":"2021","journal-title":"arXiv 2108 01073"},{"key":"ref47","article-title":"Conditional image generation with score-based diffusion models","author":"batzolis","year":"2021","journal-title":"arXiv 2111 13606"},{"key":"ref42","article-title":"Improved vector quantized diffusion models","author":"tang","year":"2022","journal-title":"arXiv 2205 16007"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053405"},{"key":"ref41","article-title":"Vector quantized diffusion model for text-to-image synthesis","author":"gu","year":"2021","journal-title":"arXiv 2111 14822"},{"key":"ref85","article-title":"Variational diffusion models","author":"kingma","year":"2021","journal-title":"arXiv 2107 00630"},{"key":"ref44","article-title":"Maximum likelihood training of score-based diffusion models","author":"song","year":"2021","journal-title":"arXiv 2101 09258"},{"key":"ref88","article-title":"Linformer: Self-attention with linear complexity","author":"wang","year":"2020","journal-title":"arXiv 2006 04768"},{"key":"ref43","article-title":"Score-based generative modeling through stochastic differential equations","author":"song","year":"2020","journal-title":"arXiv 2011 13456"},{"key":"ref87","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"arXiv 1706 03762"},{"key":"ref49","article-title":"DiffusionDepth: Diffusion denoising approach for monocular depth estimation","author":"duan","year":"2023","journal-title":"arXiv 2303 05021"},{"article-title":"Volograms & V-SENSE volumetric video dataset","year":"2021","author":"pag\u00e9s","key":"ref8"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.56541\/SOWW6683"},{"key":"ref9","article-title":"Multimodal image synthesis and editing: The generative AI era","author":"zhan","year":"2021","journal-title":"arXiv 2112 13592"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00239"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/j.jvcir.2018.03.012"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01294"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00016"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00880"},{"key":"ref81","article-title":"DreamFusion: Text-to-3D using 2D diffusion","author":"poole","year":"2022","journal-title":"arXiv 2209 14988"},{"key":"ref40","article-title":"D2C: Diffusion-denoising models for few-shot conditional generation","author":"sinha","year":"2021","journal-title":"arXiv 2106 06819"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01118"},{"key":"ref83","article-title":"DAG: Depth-aware guidance with denoising diffusion probabilistic models","author":"kim","year":"2022","journal-title":"arXiv 2212 08861"},{"key":"ref80","article-title":"Infinite nature: Perpetual view generation of natural scenes from a single image","author":"liu","year":"2020","journal-title":"arXiv 2012 09855"},{"key":"ref35","article-title":"Denoising diffusion implicit models","author":"song","year":"2020","journal-title":"arXiv 2010 02502"},{"key":"ref79","article-title":"Novel view synthesis with diffusion models","author":"watson","year":"2022","journal-title":"arXiv 2210 04628"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.2139\/ssrn.4184452"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01384"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01209"},{"key":"ref36","article-title":"Progressive distillation for fast sampling of diffusion models","author":"salimans","year":"2022","journal-title":"arXiv 2202 00512"},{"key":"ref31","article-title":"Neural spline flows","author":"durkan","year":"2019","journal-title":"arXiv 1906 04032"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00577"},{"key":"ref30","article-title":"Density estimation using real NVP","author":"dinh","year":"2016","journal-title":"arXiv 1605 08803"},{"key":"ref74","article-title":"ZoeDepth: Zero-shot transfer by combining relative and metric depth","author":"bhat","year":"2023","journal-title":"arXiv 2302 12288"},{"key":"ref33","article-title":"Large scale GAN training for high fidelity natural image synthesis","author":"brock","year":"2018","journal-title":"arXiv 1809 11096"},{"key":"ref77","article-title":"Point-E: A system for generating 3D point clouds from complex prompts","author":"nichol","year":"2022","journal-title":"arXiv 2212 08751"},{"key":"ref32","article-title":"Generative adversarial networks","author":"goodfellow","year":"2014","journal-title":"arXiv 1406 2661"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00286"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2766945"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356571"},{"key":"ref39","article-title":"High-resolution image synthesis with latent diffusion models","author":"rombach","year":"2021","journal-title":"arXiv 2112 10752"},{"key":"ref38","article-title":"Diffusion autoencoders: Toward a meaningful and decodable representation","author":"preechakul","year":"2021","journal-title":"arXiv 2111 15640"},{"key":"ref71","article-title":"Monocular depth estimation using diffusion models","author":"saxena","year":"2023","journal-title":"arXiv 2302 14816"},{"key":"ref70","article-title":"Dynamic fusion network for light field depth estimation","author":"piao","year":"2021","journal-title":"arXiv 2104 05969"},{"key":"ref73","article-title":"Towards robust monocular depth estimation: Mixing datasets for zero-shot cross-dataset transfer","author":"ranftl","year":"2019","journal-title":"arXiv 1907 01341"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref24","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","author":"saharia","year":"2022","journal-title":"arXiv 2205 11487"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2929170"},{"key":"ref23","article-title":"GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models","author":"nichol","year":"2021","journal-title":"arXiv 2112 10741"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1016\/j.isprsjprs.2021.11.003"},{"key":"ref26","article-title":"Tackling the generative learning trilemma with denoising diffusion GANs","author":"xiao","year":"2021","journal-title":"arXiv 2112 07804"},{"key":"ref25","article-title":"MM-diffusion: Learning multi-modal diffusion models for joint audio and video generation","author":"ruan","year":"2022","journal-title":"arXiv 2212 09478"},{"key":"ref69","article-title":"Pseudo-LiDAR point cloud interpolation based on 3D motion representation and spatial supervision","author":"liu","year":"2020","journal-title":"arXiv 2006 11481"},{"key":"ref20","article-title":"DiffWave: A versatile diffusion model for audio synthesis","author":"kong","year":"2020","journal-title":"arXiv 2009 09761"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3222641"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1016\/j.engappai.2022.104862"},{"key":"ref22","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"ramesh","year":"2022","journal-title":"arXiv 2204 06125"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01484-6"},{"key":"ref21","article-title":"PriorGrad: Improving conditional denoising diffusion models with data-dependent adaptive prior","author":"lee","year":"2021","journal-title":"arXiv 2106 06406"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00122"},{"key":"ref28","article-title":"Auto-encoding variational Bayes","author":"kingma","year":"2013","journal-title":"arXiv 1312 6114"},{"key":"ref27","article-title":"A novel sampling scheme for text- and image-conditional image synthesis in quantized latent spaces","author":"rampas","year":"2022","journal-title":"arXiv 2211 07292"},{"key":"ref29","article-title":"Generating diverse high-fidelity images with VQ-VAE-2","author":"razavi","year":"2019","journal-title":"arXiv 1906 00446"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.11.071"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2020.05.015"},{"key":"ref61","article-title":"Self-supervised monocular image depth learning and confidence estimation","author":"chen","year":"2018","journal-title":"arXiv 1803 05530"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/10005208\/10239167.pdf?arnumber=10239167","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,9]],"date-time":"2023-10-09T19:26:38Z","timestamp":1696879598000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10239167\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"references-count":96,"URL":"https:\/\/doi.org\/10.1109\/access.2023.3312017","relation":{},"ISSN":["2169-3536"],"issn-type":[{"type":"electronic","value":"2169-3536"}],"subject":[],"published":{"date-parts":[[2023]]}}}