{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T01:35:24Z","timestamp":1772674524243,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":58,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3681265","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"681-690","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Geometry-Guided Diffusion Model with Masked Transformer for Robust Multi-View 3D Human Pose Estimation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-9915-5608","authenticated-orcid":false,"given":"Xinyi","family":"Zhang","sequence":"first","affiliation":[{"name":"Tsinghua Shenzhen International Graduate School, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6796-2112","authenticated-orcid":false,"given":"Qinpeng","family":"Cui","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International Graduate School, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9599-1844","authenticated-orcid":false,"given":"Qiqi","family":"Bao","sequence":"additional","affiliation":[{"name":"Zhejiang University of Science &amp; Technology, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2506-1286","authenticated-orcid":false,"given":"Wenming","family":"Yang","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International Graduate School, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7509-3964","authenticated-orcid":false,"given":"Qingmin","family":"Liao","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International Graduate School, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01075"},{"key":"e_1_3_2_1_2_1","volume-title":"Dis entangled Diffusion-Based 3D Human Pose Estimation with Hierarchical Spatial and Temporal Denoiser. arXiv preprint arXiv:2403.04444","author":"Cai Qingyuan","year":"2024","unstructured":"Qingyuan Cai, Xuecai Hu, Saihui Hou, Li Yao, and Yongzhen Huang. 2024. Dis entangled Diffusion-Based 3D Human Pose Estimation with Hierarchical Spatial and Temporal Denoiser. arXiv preprint arXiv:2403.04444 (2024)."},{"key":"e_1_3_2_1_3_1","volume-title":"Proceedings, Part I 16","author":"Cao Zhe","year":"2020","unstructured":"Zhe Cao, Hang Gao, Karttikeya Mangalam, Qi-Zhi Cai, Minh Vo, and Jitendra Malik. 2020. Long-termhumanmotionpredictionwithscenecontext.InComputer Vision--ECCV 2020: 16th European Conference, Glasgow, UK, August 23--28, 2020, Proceedings, Part I 16. Springer, 387--404."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00742"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00465"},{"key":"e_1_3_2_1_6_1","volume-title":"Elucidating the solution space of extended reverse-time SDE for diffusion models. arXiv preprint arXiv:2309.06169","author":"Cui Qinpeng","year":"2023","unstructured":"Qinpeng Cui, Xinyi Zhang, Zongqing Lu, and Qingmin Liao. 2023. Elucidating the solution space of extended reverse-time SDE for diffusion models. arXiv preprint arXiv:2309.06169 (2023)."},{"key":"e_1_3_2_1_7_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. 2021. Diffusion models beat gans on image synthesis. Advances in neural information processing systems 34 (2021), 8780--8794."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00398"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00979"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3611772"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01253"},{"key":"e_1_3_2_1_12_1","volume-title":"Multipleviewgeometryincomputer vision","unstructured":"RichardHartleyandAndrewZisserman.2003. Multipleviewgeometryincomputer vision. Cambridge university press."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00780"},{"key":"e_1_3_2_1_14_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems 33","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems 33 (2020), 6840--6851."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01464"},{"key":"e_1_3_2_1_16_1","volume-title":"6m: Large scale datasets and predictive methods for 3d human sensing in natural environments","author":"Ionescu Catalin","year":"2013","unstructured":"Catalin Ionescu, Dragos Papava, Vlad Olaru, and Cristian Sminchisescu. 2013. Human3. 6m: Large scale datasets and predictive methods for 3d human sensing in natural environments. IEEE transactions on pattern analysis and machine intelligence 36, 7 (2013), 1325--1339."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00781"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of the IEEE\/CVF International Conference on Computer Vision. 14850--14860","author":"Jiang Boyuan","year":"2023","unstructured":"Boyuan Jiang, Lei Hu, and Shihong Xia. 2023. Probabilistic Triangulation for Un calibrated Multi-View 3D Human Pose Estimation. In Proceedings of the IEEE\/CVF International Conference on Computer Vision. 14850--14860."},{"key":"e_1_3_2_1_19_1","volume-title":"Panoptic Studio: A Massively Multiview System for Social Motion Capture. In The IEEE International Conference on Computer Vision (ICCV).","author":"Joo Hanbyul","year":"2015","unstructured":"Hanbyul Joo, Hao Liu, Lei Tan, Lin Gui, Bart Nabbe, Iain Matthews, Takeo Kanade, Shohei Nobuhara, and Yaser Sheikh. 2015. Panoptic Studio: A Massively Multiview System for Social Motion Capture. In The IEEE International Conference on Computer Vision (ICCV)."},{"key":"e_1_3_2_1_20_1","volume-title":"Timothy Scott Godisart, Bart Nabbe, Iain Matthews, Takeo Kanade, Shohei Nobuhara, and Yaser Sheikh.","author":"Joo Hanbyul","year":"2017","unstructured":"Hanbyul Joo, Tomas Simon, Xulong Li, Hao Liu, Lei Tan, Lin Gui, Sean Baner jee, Timothy Scott Godisart, Bart Nabbe, Iain Matthews, Takeo Kanade, Shohei Nobuhara, and Yaser Sheikh. 2017. Panoptic Studio: A Massively Multiview System for Social Interaction Capture. IEEE Transactions on Pattern Analysis and Machine Intelligence (2017)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.336"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3141231"},{"key":"e_1_3_2_1_23_1","unstructured":"Wenhao Li Hong Liu Hao Tang Pichao Wang and Luc Van Gool. 2022. Mh former: Multi-hypothesis transformer for 3d human pose estimation. In Pro ceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 13147--13156."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612368"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3379337.3415822"},{"key":"e_1_3_2_1_26_1","volume-title":"Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101","author":"Loshchilov Ilya","year":"2017","unstructured":"Ilya Loshchilov and Frank Hutter. 2017. Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)."},{"key":"e_1_3_2_1_27_1","volume-title":"Transfusion: Cross-view fusion with transformer for 3d human pose estimation. arXiv preprint arXiv:2110.09554","author":"Ma Haoyu","year":"2021","unstructured":"Haoyu Ma, Liangjian Chen, Deying Kong, Zhe Wang, Xingwei Liu, Hao Tang, Xiangyi Yan, Yusheng Xie, Shih-Yao Lin, and Xiaohui Xie. 2021. Transfusion: Cross-view fusion with transformer for 3d human pose estimation. arXiv preprint arXiv:2110.09554 (2021)."},{"key":"e_1_3_2_1_28_1","volume-title":"European Conference on Computer Vision. Springer, 424--442","author":"Chen Liangjian","year":"2022","unstructured":"HaoyuMa,ZheWang,YifeiChen,DeyingKong,Liangjian Chen, Xingwei Liu, Xi angyi Yan, Hao Tang, and Xiaohui Xie. 2022. Ppt: token-pruned pose transformer for monocular and multi-view human pose estimation. In European Conference on Computer Vision. Springer, 424--442."},{"key":"e_1_3_2_1_29_1","volume-title":"Geometry-Biased Transformer for Robust Multi-View 3D Human Pose Reconstruction. arXiv preprint arXiv:2312.17106","author":"Moliner Olivier","year":"2023","unstructured":"Olivier Moliner, Sangxia Huang, and Kalle \u00c5str\u00f6m. 2023. Geometry-Biased Transformer for Robust Multi-View 3D Human Pose Reconstruction. arXiv preprint arXiv:2312.17106 (2023)."},{"key":"e_1_3_2_1_30_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Massa Francisco","year":"2019","unstructured":"AdamPaszke,SamGross,Francisco Massa, AdamLerer,JamesBradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00444"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00608"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01356"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3188716"},{"key":"e_1_3_2_1_35_1","volume-title":"Hand Keypoint Detection in Single Images using Multiview Bootstrapping. CVPR","author":"Simon Tomas","year":"2017","unstructured":"Tomas Simon, Hanbyul Joo, and Yaser Sheikh. 2017. Hand Keypoint Detection in Single Images using Multiview Bootstrapping. CVPR (2017)."},{"key":"e_1_3_2_1_36_1","volume-title":"International conference on machine learning. PMLR, 2256--2265","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International conference on machine learning. PMLR, 2256--2265."},{"key":"e_1_3_2_1_37_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_38_1","volume-title":"Generativemodelingbyestimatinggradients of the data distribution. Advances in neural information processing systems 32","year":"2019","unstructured":"YangSongandStefanoErmon.2019. Generativemodelingbyestimatinggradients of the data distribution. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_39_1","volume-title":"andBenPoole","author":"Song Yang","year":"2020","unstructured":"Yang Song, Jascha Sohl-Dickstein, Diederik P Kingma, Abhishek Kumar, Stefano Ermon, andBenPoole. 2020. Score-based generative modeling through stochastic differential equations. arXiv preprint arXiv:2011.13456 (2020)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612221"},{"key":"e_1_3_2_1_41_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the 27th ACM international conference on multimedia. 374--382","author":"Wang Jianbo","year":"2019","unstructured":"Jianbo Wang, Kai Qiu, Houwen Peng, Jianlong Fu, and Jianke Zhu. 2019. Ai coach:Deephumanposeestimationandanalysisforpersonalizedathletictraining assistance. In Proceedings of the 27th ACM international conference on multimedia. 374--382."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00896"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612330"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01101"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00606"},{"key":"e_1_3_2_1_47_1","volume-title":"Proceedings of the 25th International Conference on Intelligent User Interfaces. 88--99","author":"Shin Hijung Valentina","year":"2020","unstructured":"NoraSWillett, Hijung Valentina Shin, Zeyu Jin, Wilmot Li, and Adam Finkelstein. 2020. Pose2Pose: Pose selection and transfer for 2D character animation. In Proceedings of the 25th International Conference on Intelligent User Interfaces. 88--99."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01122"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_29"},{"key":"e_1_3_2_1_50_1","article-title":"Learning physically simulated tennis skills from broadcast videos","volume":"42","author":"Yuan Y","year":"2023","unstructured":"Y Yuan, Viktor Makoviychuk, Y Guo, S Fidler, XB Peng, and K Fatahalian. 2023. Learning physically simulated tennis skills from broadcast videos. ACM Trans. Graph 42, 4 (2023).","journal-title":"ACM Trans. Graph"},{"key":"e_1_3_2_1_51_1","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia. 27--36","unstructured":"SophyaniBanaamwiniYussif,NingXie,YangYang,andHengTaoShen.2023. Self Relational Graph Convolution Network for Skeleton-Based Action Recognition. In Proceedings of the 31st ACM International Conference on Multimedia. 27--36."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3478513.3480500"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01288"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612532"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01398-9"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01979"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603618"},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the 31st ACM International Conference on Multimedia. 7512--7520. [59] Yujie Zhou, Wenwen Qiang, Anyi Rao, Ning Lin, Bing Su, and Jiaqi Wang.","author":"Zhou Kangkang","year":"2023","unstructured":"Kangkang Zhou, Lijun Zhang, Feng Lu, Xiang-Dong Zhou, and Yu Shi. 2023. Efficient Hierarchical Multi-view Fusion Transformer for 3D Human Pose Esti mation. In Proceedings of the 31st ACM International Conference on Multimedia. 7512--7520. [59] Yujie Zhou, Wenwen Qiang, Anyi Rao, Ning Lin, Bing Su, and Jiaqi Wang. 2023. Zero-shot skeleton-based action recognition via mutual information estimation and maximization. In Proceedings of the 31st ACM International Conference on Multimedia. 5302--5310"}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681265","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3681265","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:42Z","timestamp":1750295862000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3681265"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":58,"alternative-id":["10.1145\/3664647.3681265","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3681265","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}