{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:14:21Z","timestamp":1765340061680,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China under Grants","award":["62406039, 62321001, 62471055, U23B2001, 62171057, 62201072, 62071067"],"award-info":[{"award-number":["62406039, 62321001, 62471055, U23B2001, 62171057, 62201072, 62071067"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["2024PTB-004"],"award-info":[{"award-number":["2024PTB-004"]}]},{"name":"the Postdoctoral Fellowship Program and China Postdoctoral Science Foundation under Grants","award":["2023TQ0039, 2024M750257, GZC20230320"],"award-info":[{"award-number":["2023TQ0039, 2024M750257, GZC20230320"]}]},{"name":"the High-Quality Development Project of the MIIT","award":["2440STCZB2584"],"award-info":[{"award-number":["2440STCZB2584"]}]},{"name":"the Ministry of Education and China Mobile Joint Fund","award":["MCM20200202, MCM20180101"],"award-info":[{"award-number":["MCM20200202, MCM20180101"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755290","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:26:51Z","timestamp":1761377211000},"page":"10006-10014","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["A Dual-Branch 3D Spatial-Aware Latent Diffusion for Realistic Depth Image Synthesis"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-8561-7387","authenticated-orcid":false,"given":"Shuang","family":"Hao","sequence":"first","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1691-6457","authenticated-orcid":false,"given":"Pengfei","family":"Ren","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3429-6692","authenticated-orcid":false,"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Cloud Network, China Unicom Network Communications Co., Ltd., Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3072-7422","authenticated-orcid":false,"given":"Haifeng","family":"Sun","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2504-7426","authenticated-orcid":false,"given":"Pan","family":"Ting","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, BeiJing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9713-9938","authenticated-orcid":false,"given":"Menghao","family":"Zhang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0184-5965","authenticated-orcid":false,"given":"Cong","family":"Liu","sequence":"additional","affiliation":[{"name":"China Mobile Research Institute, China Mobile Communications Company Limited Research Institute, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0829-4624","authenticated-orcid":false,"given":"Qi","family":"Qi","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1486-0573","authenticated-orcid":false,"given":"Jianxin","family":"Liao","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2182-2228","authenticated-orcid":false,"given":"JingYu","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing University of Post and Telecommunication, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"707","article-title":"Analysis and noise modeling of the intel realsense d435 for mobile robots","author":"Ahn Min Sung","year":"2019","unstructured":"Min Sung Ahn, Hosik Chae, Donghun Noh, Hyunwoo Nam, and Dennis Hong. 2019. Analysis and noise modeling of the intel realsense d435 for mobile robots. In UR. IEEE, 707-711.","journal-title":"UR. IEEE"},{"key":"e_1_3_2_1_2_1","first-page":"85","article-title":"Measuring generalisation to unseen viewpoints, articulations, shapes and objects for 3D hand pose estimation under hand-object interaction","author":"Armagan Anil","year":"2020","unstructured":"Anil Armagan, Guillermo Garcia-Hernando, Seungryul Baek, Shreyas Hampali, Mahdi Rad, Zhaohui Zhang, Shipeng Xie, MingXiu Chen, Boshen Zhang, Fu Xiong, et al., 2020. Measuring generalisation to unseen viewpoints, articulations, shapes and objects for 3D hand pose estimation under hand-object interaction. In ECCV. Springer, 85-101.","journal-title":"ECCV. Springer"},{"key":"e_1_3_2_1_3_1","first-page":"2262","article-title":"Diffusion-sdf: Conditional generative modeling of signed distance functions","author":"Chou Gene","year":"2023","unstructured":"Gene Chou, Yuval Bahat, and Felix Heide. 2023. Diffusion-sdf: Conditional generative modeling of signed distance functions. In ICCV. 2262-2272.","journal-title":"ICCV."},{"key":"e_1_3_2_1_4_1","volume-title":"NIPS","volume":"29","author":"Dosovitskiy Alexey","year":"2016","unstructured":"Alexey Dosovitskiy and Thomas Brox. 2016. Generating images with perceptual similarity metrics based on deep networks. In NIPS, Vol. 29 (2016)."},{"key":"e_1_3_2_1_5_1","first-page":"12873","article-title":"Taming transformers for high-resolution image synthesis","author":"Esser Patrick","year":"2021","unstructured":"Patrick Esser, Robin Rombach, and Bjorn Ommer. 2021. Taming transformers for high-resolution image synthesis. In CVPR. 12873-12883.","journal-title":"CVPR."},{"key":"e_1_3_2_1_6_1","volume-title":"Adaptive Multi-Modal Control of Digital Human Hand Synthesis Using a Region-Aware Cycle Loss. In arXiv:2409.09149","author":"Fu Qifan","year":"2024","unstructured":"Qifan Fu, Xiaohang Yang, Muhammad Asad, Changjae Oh, Shanxin Yuan, and Gregory Slabaugh. 2024. Adaptive Multi-Modal Control of Digital Human Hand Synthesis Using a Region-Aware Cycle Loss. In arXiv:2409.09149 (2024)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i3.27946"},{"key":"e_1_3_2_1_8_1","first-page":"22768","article-title":"Controllable person image synthesis with pose-constrained latent diffusion","author":"Han Xiao","year":"2023","unstructured":"Xiao Han, Xiatian Zhu, Jiankang Deng, Yi-Zhe Song, and Tao Xiang. 2023. Controllable person image synthesis with pose-constrained latent diffusion. In ICCV. 22768-22777.","journal-title":"ICCV."},{"key":"e_1_3_2_1_9_1","volume-title":"NIPS","volume":"30","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. In NIPS, Vol. 30 (2017)."},{"key":"e_1_3_2_1_10_1","first-page":"6840","volume-title":"NIPS","volume":"33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In NIPS, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_11_1","volume-title":"NIPS","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. In NIPS (2022)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6761"},{"key":"e_1_3_2_1_13_1","first-page":"1125","article-title":"Image-to-image translation with conditional adversarial networks","author":"Isola Phillip","year":"2017","unstructured":"Phillip Isola, Jun-Yan Zhu, Tinghui Zhou, and Alexei A Efros. 2017. Image-to-image translation with conditional adversarial networks. In CVPR. 1125-1134.","journal-title":"CVPR."},{"key":"e_1_3_2_1_14_1","first-page":"15988","article-title":"Humansd: A native skeleton-guided diffusion model for human image generation","author":"Ju Xuan","year":"2023","unstructured":"Xuan Ju, Ailing Zeng, Chenchen Zhao, Jianan Wang, Lei Zhang, and Qiang Xu. 2023. Humansd: A native skeleton-guided diffusion model for human image generation. In ICCV. 15988-15998.","journal-title":"ICCV."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i12.29199"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2015.2494877"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2656463"},{"key":"e_1_3_2_1_18_1","volume-title":"Hyperhuman: Hyper-realistic human generation with latent structural diffusion. In arXiv:2310.08579","author":"Liu Xian","year":"2023","unstructured":"Xian Liu, Jian Ren, Aliaksandr Siarohin, Ivan Skorokhodov, Yanyu Li, Dahua Lin, Xihui Liu, Ziwei Liu, and Sergey Tulyakov. 2023. Hyperhuman: Hyper-realistic human generation with latent structural diffusion. In arXiv:2310.08579 (2023)."},{"key":"e_1_3_2_1_19_1","first-page":"6420","article-title":"Coarse-to-fine latent diffusion for pose-guided person image synthesis","author":"Lu Yanzuo","year":"2024","unstructured":"Yanzuo Lu, Manlin Zhang, Andy J Ma, Xiaohua Xie, and Jianhuang Lai. 2024. Coarse-to-fine latent diffusion for pose-guided person image synthesis. In CVPR. 6420-6429.","journal-title":"CVPR."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSEN.2014.2309987"},{"key":"e_1_3_2_1_21_1","volume-title":"Pyrender: A pure Python, real-time, physically based renderer for 3D visualization. https:\/\/github.com\/mmatl\/pyrender. Accessed: 2024-11-13.","author":"Matl Matthew","year":"2019","unstructured":"Matthew Matl. 2019. Pyrender: A pure Python, real-time, physically based renderer for 3D visualization. https:\/\/github.com\/mmatl\/pyrender. Accessed: 2024-11-13."},{"volume-title":"Contrastive learning for unpaired image-to-image translation","author":"Park Taesung","key":"e_1_3_2_1_22_1","unstructured":"Taesung Park, Alexei A Efros, Richard Zhang, and Jun-Yan Zhu. 2020. Contrastive learning for unpaired image-to-image translation. In ECCV. Springer, 319-345."},{"key":"e_1_3_2_1_23_1","volume-title":"Ozge Mercanoglu Sincan, and Richard Bowden","author":"Pelykh Anton","year":"2024","unstructured":"Anton Pelykh, Ozge Mercanoglu Sincan, and Richard Bowden. 2024. Giving a Hand to Diffusion Models: a Two-Stage Approach to Improving Conditional Human Image Generation. arXiv:2403.10731 (2024)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25310"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3192708"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.01.045"},{"key":"e_1_3_2_1_27_1","first-page":"10684","article-title":"High-resolution image synthesis with latent diffusion models","author":"Rombach Robin","year":"2022","unstructured":"Robin Rombach, Andreas Blattmann, Dominik Lorenz, Patrick Esser, and Bj\u00f6rn Ommer. 2022. High-resolution image synthesis with latent diffusion models. In CVPR. 10684-10695.","journal-title":"CVPR."},{"key":"e_1_3_2_1_28_1","volume-title":"Embodied hands: Modeling and capturing hands and bodies together. In arXiv:2201.02610","author":"Romero Javier","year":"2022","unstructured":"Javier Romero, Dimitrios Tzionas, and Michael J Black. 2022. Embodied hands: Modeling and capturing hands and bodies together. In arXiv:2201.02610 (2022)."},{"key":"e_1_3_2_1_29_1","volume-title":"U-net: Convolutional networks for biomedical image segmentation","author":"Ronneberger Olaf","year":"2015","unstructured":"Olaf Ronneberger, Philipp Fischer, and Thomas Brox. 2015. U-net: Convolutional networks for biomedical image segmentation. In MICCAI. Springer, 234-241."},{"key":"e_1_3_2_1_30_1","first-page":"5400","article-title":"Scribbler: Controlling deep image synthesis with sketch and color","author":"Sangkloy Patsorn","year":"2017","unstructured":"Patsorn Sangkloy, Jingwan Lu, Chen Fang, Fisher Yu, and James Hays. 2017. Scribbler: Controlling deep image synthesis with sketch and color. In CVPR. 5400-5409.","journal-title":"CVPR."},{"key":"e_1_3_2_1_31_1","first-page":"4845","article-title":"Dcl: Differential contrastive learning for geometry-aware depth synthesis","volume":"7","author":"Shen Yuefan","year":"2022","unstructured":"Yuefan Shen, Yanchao Yang, Youyi Zheng, C Karen Liu, and Leonidas J Guibas. 2022. Dcl: Differential contrastive learning for geometry-aware depth synthesis. In RAL, Vol. 7, 2 (2022), 4845-4852.","journal-title":"RAL"},{"key":"e_1_3_2_1_32_1","volume-title":"Alykhan Tejani, and Tae-Kyun Kim.","author":"Tang Danhang","year":"2014","unstructured":"Danhang Tang, Hyung Jin Chang, Alykhan Tejani, and Tae-Kyun Kim. 2014. Latent Regression Forest: Structured Estimation of 3D Articulated Hand Posture. In CVPR (2014), 3786-3793. https:\/\/api.semanticscholar.org\/CorpusID:4632519"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/2629500"},{"key":"e_1_3_2_1_34_1","first-page":"4604","article-title":"Pointpainting: Sequential fusion for 3d object detection","author":"Vora Sourabh","year":"2020","unstructured":"Sourabh Vora, Alex H Lang, Bassam Helou, and Oscar Beijbom. 2020. Pointpainting: Sequential fusion for 3d object detection. In CVPR. 4604-4612.","journal-title":"CVPR."},{"key":"e_1_3_2_1_35_1","first-page":"11794","article-title":"Pointaugmenting: Cross-modal augmentation for 3d object detection","author":"Wang Chunwei","year":"2021","unstructured":"Chunwei Wang, Chao Ma, Ming Zhu, and Xiaokang Yang. 2021. Pointaugmenting: Cross-modal augmentation for 3d object detection. In CVPR. 11794-11803.","journal-title":"CVPR."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_2_1_37_1","first-page":"20406","article-title":"SpatialTracker: Tracking Any 2D Pixels in 3D Space","author":"Xiao Yuxi","year":"2024","unstructured":"Yuxi Xiao, Qianqian Wang, Shangzhan Zhang, Nan Xue, Sida Peng, Yujun Shen, and Xiaowei Zhou. 2024. SpatialTracker: Tracking Any 2D Pixels in 3D Space. In CVPR. 20406-20417.","journal-title":"CVPR."},{"key":"e_1_3_2_1_38_1","first-page":"28545","volume-title":"NIPS","volume":"35","author":"Xie Shaoan","year":"2022","unstructured":"Shaoan Xie, Qirong Ho, and Kun Zhang. 2022. Unsupervised image-to-image translation with density changing regularization. NIPS, Vol. 35 (2022), 28545-28558."},{"key":"e_1_3_2_1_39_1","first-page":"793","article-title":"A2j: Anchor-to-joint regression network for 3d articulated pose estimation from a single depth image","author":"Xiong Fu","year":"2019","unstructured":"Fu Xiong, Boshen Zhang, Yang Xiao, Zhiguo Cao, Taidong Yu, Joey Tianyi Zhou, and Junsong Yuan. 2019. A2j: Anchor-to-joint regression network for 3d articulated pose estimation from a single depth image. In ICCV. 793-802.","journal-title":"ICCV."},{"key":"e_1_3_2_1_40_1","volume-title":"Yu Zhang, and Li Cheng.","author":"Xu Chi","year":"2016","unstructured":"Chi Xu, Lakshmi Narasimhan Govindarajan, Yu Zhang, and Li Cheng. 2016. Lie-X: Depth Image Based Articulated Object Pose Estimation, Tracking, and Action Recognition on Lie Groups. arXiv:1609.03773 [cs.CV] https:\/\/arxiv.org\/abs\/1609.03773"},{"volume-title":"A shadow repair approach for kinect depth maps","author":"Yu Yu","key":"e_1_3_2_1_41_1","unstructured":"Yu Yu, Yonghong Song, Yuanlin Zhang, and Shu Wen. 2013. A shadow repair approach for kinect depth maps. In ACCV. Springer, 615-626."},{"key":"e_1_3_2_1_42_1","first-page":"3836","article-title":"Adding conditional control to text-to-image diffusion models","author":"Zhang Lvmin","year":"2023","unstructured":"Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. 2023a. Adding conditional control to text-to-image diffusion models. In ICCV. 3836-3847.","journal-title":"ICCV."},{"key":"e_1_3_2_1_43_1","first-page":"3836","article-title":"Adding conditional control to text-to-image diffusion models","author":"Zhang Lvmin","year":"2023","unstructured":"Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. 2023b. Adding conditional control to text-to-image diffusion models. In ICCV. 3836-3847.","journal-title":"ICCV."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"crossref","unstructured":"Mengqi Zhang Yang Fu Zheng Ding Sifei Liu Zhuowen Tu and Xiaolong Wang. 2024. HOIDiffusion: Generating Realistic 3D Hand-Object Interaction Data. arXiv:2403.12011 [cs.CV] https:\/\/arxiv.org\/abs\/2403.12011","DOI":"10.1109\/CVPR52733.2024.00814"},{"key":"e_1_3_2_1_45_1","first-page":"586","article-title":"The unreasonable effectiveness of deep features as a perceptual metric","author":"Zhang Richard","year":"2018","unstructured":"Richard Zhang, Phillip Isola, Alexei A Efros, Eli Shechtman, and Oliver Wang. 2018a. The unreasonable effectiveness of deep features as a perceptual metric. In CVPR. 586-595.","journal-title":"CVPR."},{"key":"e_1_3_2_1_46_1","first-page":"586","article-title":"The unreasonable effectiveness of deep features as a perceptual metric","author":"Zhang Richard","year":"2018","unstructured":"Richard Zhang, Phillip Isola, Alexei A Efros, Eli Shechtman, and Oliver Wang. 2018b. The unreasonable effectiveness of deep features as a perceptual metric. In CVPR. 586-595.","journal-title":"CVPR."},{"key":"e_1_3_2_1_47_1","unstructured":"Yunpeng Zhang Qiang Wang Fan Jiang Yaqi Fan Mu Xu and Yonggang Qi. 2025. FantasyID: Face Knowledge Enhanced ID-Preserving Video Generation. arXiv:2502.13995 [cs.GR] https:\/\/arxiv.org\/abs\/2502.13995"},{"key":"e_1_3_2_1_48_1","volume-title":"TMM","author":"Zhou Jun","year":"2023","unstructured":"Jun Zhou, Chi Xu, Yuting Ge, and Li Cheng. 2023. Realistic Depth Image Synthesis for 3D Hand Pose Estimation. In TMM (2023)."},{"key":"e_1_3_2_1_49_1","first-page":"9251","volume-title":"NIPS","volume":"33","author":"Zhou Yi","year":"2020","unstructured":"Yi Zhou, Chenglei Wu, Zimo Li, Chen Cao, Yuting Ye, Jason Saragih, Hao Li, and Yaser Sheikh. 2020. Fully convolutional mesh autoencoder using efficient spatially varying kernels. In NIPS, Vol. 33 (2020), 9251-9262."},{"key":"e_1_3_2_1_50_1","first-page":"2223","article-title":"Unpaired image-to-image translation using cycle-consistent adversarial networks","author":"Zhu Jun-Yan","year":"2017","unstructured":"Jun-Yan Zhu, Taesung Park, Phillip Isola, and Alexei A Efros. 2017. Unpaired image-to-image translation using cycle-consistent adversarial networks. In ICCV. 2223-2232.","journal-title":"ICCV."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","sponsor":["SIGMM ACM Special Interest Group on Multimedia"],"location":"Dublin Ireland","acronym":"MM '25"},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755290","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:09:38Z","timestamp":1765339778000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755290"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":50,"alternative-id":["10.1145\/3746027.3755290","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755290","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}