{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,21]],"date-time":"2026-03-21T05:20:40Z","timestamp":1774070440878,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,26]],"date-time":"2023-10-26T00:00:00Z","timestamp":1698278400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc\/4.0\/"}],"funder":[{"name":"National Natural Science Foundation of China","award":["62106247"],"award-info":[{"award-number":["62106247"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,26]]},"DOI":"10.1145\/3581783.3612098","type":"proceedings-article","created":{"date-parts":[[2023,10,27]],"date-time":"2023-10-27T19:52:20Z","timestamp":1698436340000},"page":"7512-7520","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Efficient Hierarchical Multi-view Fusion Transformer for 3D Human Pose Estimation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8624-414X","authenticated-orcid":false,"given":"Kangkang","family":"Zhou","sequence":"first","affiliation":[{"name":"Chongqing Institute of Green and Intelligent Technology, Chinese Academy of Sciences &amp; Chongqing School, University of Chinese Academy of Sciences, Chongqing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9453-4032","authenticated-orcid":false,"given":"Lijun","family":"Zhang","sequence":"additional","affiliation":[{"name":"Chongqing Institute of Green and Intelligent Technology, Chinese Academy of Sciences &amp; Chongqing School, University of Chinese Academy of Sciences, Chongqing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5236-2287","authenticated-orcid":false,"given":"Feng","family":"Lu","sequence":"additional","affiliation":[{"name":"Tsinghua Shenzhen International Graduate School, Tsinghua University &amp; Peng Cheng Laboratory, shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4451-5327","authenticated-orcid":false,"given":"Xiang-Dong","family":"Zhou","sequence":"additional","affiliation":[{"name":"Chongqing Institute of Green and Intelligent Technology, Chinese Academy of Sciences; Chongqing school, University of Chinese Academy of Sciences, Chongqing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9117-8282","authenticated-orcid":false,"given":"Yu","family":"Shi","sequence":"additional","affiliation":[{"name":"Chongqing Institute of Green and Intelligent Technology, Chinese Academy of Sciences; Chongqing school, University of Chinese Academy of Sciences, Chongqing, China"}]}],"member":"320","published-online":{"date-parts":[[2023,10,27]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"crossref","unstructured":"Kristijan Bartol David Bojani\u0107 Tomislav Petkovi\u0107 and Tomislav Pribani?. 2022. Generalizable Human Pose Triangulation. In CVPR. 11028--11037.","DOI":"10.1109\/CVPR52688.2022.01075"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"crossref","unstructured":"Arij Bouazizi Julian Wiederer Ulrich Kressel and Vasileios Belagiannis. 2021. Self-Supervised 3D Human Pose Estimation with Multiple-View Geometry. In FG.","DOI":"10.1109\/FG52635.2021.9667074"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"crossref","unstructured":"Simon Bultmann and Sven Behnke. 2021. Real-Time Multi-View 3D Human Pose Estimation using Semantic Feedback to Smart Edge Sensors. In RSS.","DOI":"10.15607\/RSS.2021.XVII.040"},{"key":"e_1_3_2_2_4_1","volume-title":"Junsong Yuan, and Nadia Magnenat Thalmann.","author":"Cai Yujun","year":"2019","unstructured":"Yujun Cai, Liuhao Ge, Jun Liu, Jianfei Cai, Tat Jen Cham, Junsong Yuan, and Nadia Magnenat Thalmann. 2019. Exploiting spatial-temporal relationships for 3d pose estimation via graph convolutional networks. In ICCV. 2272--2281."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"crossref","unstructured":"Inho Chang Min-Gyu Park Jaewoo Kim and Ju Hong Yoon. 2021. Multi-View 3D Human Pose Estimation with Self-Supervised Learning. In ICAIIC.","DOI":"10.1109\/ICAIIC51459.2021.9415244"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3057267"},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"crossref","unstructured":"Xipeng Chen Pengxu Wei and Liang Lin. 2021. Deductive Learning for Weakly-Supervised 3D Human Pose Estimation via Uncalibrated Cameras. In AAAI. 1089--1096.","DOI":"10.1609\/aaai.v35i2.16194"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"crossref","unstructured":"Yilun Chen Zhicheng Wang Yuxiang Peng Zhiqiang Zhang Gang Yu and Jian Sun. 2018. Cascaded pyramid network for multi-person pose estimation. In CVPR. 7103--7112.","DOI":"10.1109\/CVPR.2018.00742"},{"key":"e_1_3_2_2_9_1","first-page":"617","article-title":"Three-dimensional body and centre of mass kinematics in alpine ski racing using differential gnss and inertial sensors","volume":"8","author":"Fasel Benedikt","year":"2016","unstructured":"Benedikt Fasel, J\u00f6rg Sp\u00f6rri, Matthias Gilgien, Geo Boffi, Julien Chardonnens, Erich M\u00fcller, and Kamiar Aminian. 2016. Three-dimensional body and centre of mass kinematics in alpine ski racing using differential gnss and inertial sensors. RS 8, 8 (2016), 617.","journal-title":"RS"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2022.02.076"},{"key":"e_1_3_2_2_11_1","volume-title":"Flex: Parameter-free multi-view 3d human motion reconstruction. In ECCV.","author":"Gordon Brian","year":"2022","unstructured":"Brian Gordon, Sigal Raab, Guy Azov, Raja Giryes, and Daniel Cohen-Or. 2022. Flex: Parameter-free multi-view 3d human motion reconstruction. In ECCV."},{"key":"e_1_3_2_2_12_1","volume-title":"Yu","author":"He Yihui","year":"2020","unstructured":"Yihui He, Rui Yan, Katerina Fragkiadaki, and Shoou I. Yu. 2020. Epipolar trans-formers. In CVPR. 7779--7788."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"Fuyang Huang Ailing Zeng Minhao Liu and Qiuxia Lai. 2020. DeepFuse: An IMU-Aware Network for Real-Time 3D Human Pose Estimation from Multi-View Image. In WACV. 429--438.","DOI":"10.1109\/WACV45572.2020.9093526"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.248"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"crossref","unstructured":"Umar Iqbal Pavlo Molchanov and Jan Kautz. 2020. Weakly-supervised 3d human pose learning via multi-view images in the wild. In CVPR. 5242--5251.","DOI":"10.1109\/CVPR42600.2020.00529"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","unstructured":"Karim Iskakov Egor Burkov Victor Lempitsky and Yury Malkov. 2019. Learnable triangulation of human pose. In ICCV. 7717--7726.","DOI":"10.1109\/ICCV.2019.00781"},{"key":"e_1_3_2_2_17_1","volume-title":"A generalizable approach for multi-view 3D human pose regression. Machine Vision and Applications 32, 6","author":"Kadkhodamohammadi Abdolrahim","year":"2021","unstructured":"Abdolrahim Kadkhodamohammadi and Nicolas Padoy. 2021. A generalizable approach for multi-view 3D human pose regression. Machine Vision and Applications 32, 6 (2021)."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"crossref","unstructured":"Muhammed Kocabas Salih Karagoz and Emre Akbas. 2019. Self-supervised learning of 3d human pose using multi-view geometry. In CVPR. 1077--1086.","DOI":"10.1109\/CVPR.2019.00117"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2022.3141231"},{"key":"e_1_3_2_2_20_1","volume-title":"Mh-former: Multi-hypothesis transformer for 3d human pose estimation. In CVPR. 13147--13156.","author":"Li Wenhao","year":"2022","unstructured":"Wenhao Li, Hong Liu, Hao Tang, Pichao Wang, and Luc Van Gool. 2022. Mh-former: Multi-hypothesis transformer for 3d human pose estimation. In CVPR. 13147--13156."},{"key":"e_1_3_2_2_21_1","volume-title":"Sen ching Cheung, and Vijayan Asari","author":"Liu Ruixu","year":"2020","unstructured":"Ruixu Liu, Ju Shen, He Wang, Chen Chen, Sen ching Cheung, and Vijayan Asari. 2020. Attention mechanism exploits temporal contexts: Real-time 3d human pose reconstruction. In CVPR. 5064--5073."},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"crossref","unstructured":"Diogo C. Luvizon David Picard and Hedi Tabia. 2018. 2D\/3D pose estimation and action recognition using multitask deep learning. In CVPR. 5137--5146.","DOI":"10.1109\/CVPR.2018.00539"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-021-01570-9"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.cag.2019.09.002"},{"key":"e_1_3_2_2_25_1","volume-title":"Transfusion: Cross-view fusion with transformer for 3d human pose estimation. In BMVC.","author":"Ma Haoyu","year":"2021","unstructured":"Haoyu Ma, Liangjian Chen, Deying Kong, Zhe Wang, Xingwei Liu, Hao Tang, Xiangyi Yan, Yusheng Xie, Shih-Yao Lin, and Xiaohui Xie. 2021. Transfusion: Cross-view fusion with transformer for 3d human pose estimation. In BMVC."},{"key":"e_1_3_2_2_26_1","doi-asserted-by":"crossref","unstructured":"Dushyant Mehta Helge Rhodin Dan Casas Pascal Fua Oleksandr Sotnychenko Weipeng Xu and Christian Theobalt. 2017. Monocular 3D human pose estimation in the wild using im-proved cnn supervision. In 3DV.","DOI":"10.1109\/3DV.2017.00064"},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"crossref","unstructured":"Rahul Mitra Nitesh B. Gundavarapu Abhishek Sharma and Arjun Jain. 2020. Multiview-Consistent Semi-Supervised Learning for 3D Human Pose Estimation. In CVPR. 6907--6916.","DOI":"10.1109\/CVPR42600.2020.00694"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"crossref","unstructured":"Gyeongsik Moon and Kyoung Mu Lee. 2020. I2l-meshnet: Image-to-lixel prediction network for accurate 3d human pose and mesh estimation from a single rgb image. In ECCV. 752--768.","DOI":"10.1007\/978-3-030-58571-6_44"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"crossref","unstructured":"Georgios Pavlakos Xiaowei Zhou Konstantinos G. Derpanis and Kostas Daniilidis. 2017. Coarse-to-fine volumetric prediction for single-image 3d human pose. In CVPR. 1263--1272.","DOI":"10.1109\/CVPR.2017.139"},{"key":"e_1_3_2_2_30_1","doi-asserted-by":"crossref","unstructured":"Haibo Qiu Chunyu Wang Jingdong Wang Naiyan Wang and Wenjun Zeng. 2019. Cross View Fusion for 3D Human Pose Estimation. In ICCV. 4342--4351.","DOI":"10.1109\/ICCV.2019.00444"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"crossref","unstructured":"Edoardo Remelli Shangchen Han Sina Honari Pascal Fua and Robert Wang. 2020. Lightweight multi-view 3d pose estimation through camera-disentangled representation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00608"},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"crossref","unstructured":"Helge Rhodin Fr\u00e9d\u00e9ric Meyer J\u00f6rg Sp\u00f6rri Erich M\u00fcller Victor Constantin Pascal Fua Isinsu Katircioglu and Mathieu Salzmann. 2018. Learning Monocular 3D Human Pose Estimation From Multi-View Images. In CVPR. 8437--8446.","DOI":"10.1109\/CVPR.2018.00880"},{"key":"e_1_3_2_2_33_1","first-page":"1","article-title":"Adaptive Multi-view and Temporal Fusing Transformer for 3D Human Pose Estimation","volume":"14","author":"Shuai Hui","year":"2022","unstructured":"Hui Shuai, Lele Wu, and Qingshan Liu. 2022. Adaptive Multi-view and Temporal Fusing Transformer for 3D Human Pose Estimation. TPAMI 14, 8 (2022), 1--14.","journal-title":"TPAMI"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"crossref","unstructured":"Denis Tome Matteo Toso Lourdes Agapito and Chris Russell. 2018. Rethinking pose in 3D: Multi-stage refinement and recovery for markerless motion capture. In 3DV. 474--483.","DOI":"10.1109\/3DV.2018.00061"},{"key":"e_1_3_2_2_35_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. In NeurIPS."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"crossref","unstructured":"Bastian Wandt Marco Rudolph Petrissa Zell Helge Rhodin and Bodo Rosenhahn. 2021. CanonPose: Self-Supervised Monocular 3D Human Pose Estimation in the Wild. In CVPR. 13294--13304.","DOI":"10.1109\/CVPR46437.2021.01309"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01398-9"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"crossref","unstructured":"Jingbo Wang Sijie Yan Yuanjun Xiong and Dahua Lin. 2020. Motion guided 3d pose estimation from videos. In ECCV. 764--780.","DOI":"10.1007\/978-3-030-58601-0_45"},{"key":"e_1_3_2_2_39_1","volume-title":"Metafuse: A pre-trained fusion model for human pose estimation. In CVPR. 13686--13695.","author":"Xie Rongchang","year":"2020","unstructured":"Rongchang Xie, Chunyu Wang, and Yizhou Wang. 2020. Metafuse: A pre-trained fusion model for human pose estimation. In CVPR. 13686--13695."},{"key":"e_1_3_2_2_40_1","unstructured":"Tianhan Xu and Wataru Takano. 2021. Graph Stacked Hourglass Networks for 3D Human Pose Estimation. In CVPR. 16105--16114."},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"crossref","unstructured":"Ailing Zeng Xiao Sun Lei Yang Nanxuan Zhao Minhao Liu and Qiang Xu. 2021. Learning skeletal graph neural networks for hard 3d pose estimation. In ICCV. 11436--11445.","DOI":"10.1109\/ICCV48922.2021.01124"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"crossref","unstructured":"Jinlu Zhang Zhigang Tu Jianyu Yang Yujin Chen and Junsong Yuan. 2022. MixSTE: Seq2seq Mixed Spatio-Temporal Encoder for 3D Human Pose Estimation in Video. In CVPR. 13232--13242.","DOI":"10.1109\/CVPR52688.2022.01288"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"crossref","unstructured":"Yuxiang Zhang Liang An Tao Yu Xiu Li Kun Li and Yebin Liu. 2020. 4d association graph for realtime multi-person motion capture using multiple video cameras. In CVPR. 1321--1330.","DOI":"10.1109\/CVPR42600.2020.00140"},{"key":"e_1_3_2_2_44_1","volume-title":"Metaxas","author":"Zhao Long","year":"2019","unstructured":"Long Zhao, Xi Peng, Yu Tian, Mubbasir Kapadia, and Dimitris N. Metaxas. 2019. Semantic graph convolutional networks for 3D human pose regression. In ICCV. 3425--3435."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Weixi Zhao Weiqiang Wang and Yunjie Tian. 2022. GraFormer: Graph-Oriented Transformer for 3D Pose Estimation. In CVPR. 20438--20447.","DOI":"10.1109\/CVPR52688.2022.01979"},{"key":"e_1_3_2_2_46_1","doi-asserted-by":"crossref","unstructured":"Ce Zheng Sijie Zhu Matias Mendieta Taojiannan Yang Chen Chen and Zhengming Ding. 2021. 3d human pose estimation with spatial and temporal transformers. In CVPR. 11656--11665.","DOI":"10.1109\/ICCV48922.2021.01145"}],"event":{"name":"MM '23: The 31st ACM International Conference on Multimedia","location":"Ottawa ON Canada","acronym":"MM '23","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 31st ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612098","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3581783.3612098","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T00:05:21Z","timestamp":1755821121000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3581783.3612098"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,26]]},"references-count":46,"alternative-id":["10.1145\/3581783.3612098","10.1145\/3581783"],"URL":"https:\/\/doi.org\/10.1145\/3581783.3612098","relation":{},"subject":[],"published":{"date-parts":[[2023,10,26]]},"assertion":[{"value":"2023-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}