{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T08:06:12Z","timestamp":1773993972722,"version":"3.50.1"},"reference-count":55,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,7,1]],"date-time":"2026-07-01T00:00:00Z","timestamp":1782864000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62373009"],"award-info":[{"award-number":["62373009"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Pattern Recognition"],"published-print":{"date-parts":[[2026,7]]},"DOI":"10.1016\/j.patcog.2026.113072","type":"journal-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T07:54:59Z","timestamp":1768290899000},"page":"113072","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["DSVTformer: Dual-stream spatial-view-temporal transformer for multi-view 3D human pose estimation"],"prefix":"10.1016","volume":"175","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-4343-7360","authenticated-orcid":false,"given":"Wanruo","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6332-8316","authenticated-orcid":false,"given":"Mengyuan","family":"Liu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8048-2668","authenticated-orcid":false,"given":"Wenhao","family":"Li","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7498-6541","authenticated-orcid":false,"given":"Hong","family":"Liu","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/j.patcog.2026.113072_bib0001","series-title":"CVPR","first-page":"7307","article-title":"Ordinal depth supervision for 3D human pose estimation","author":"Pavlakos","year":"2018"},{"key":"10.1016\/j.patcog.2026.113072_bib0002","series-title":"ICCV","first-page":"2602","article-title":"Compositional human pose regression","author":"Sun","year":"2017"},{"key":"10.1016\/j.patcog.2026.113072_bib0003","article-title":"DGFormer: Dynamic graph transformer for 3D human pose estimation","volume":"152","author":"Chen","year":"2024","journal-title":"PR"},{"key":"10.1016\/j.patcog.2026.113072_bib0004","series-title":"ECCV","first-page":"19","article-title":"UPose3D: uncertainty-aware 3D human pose estimation with cross-view and temporal cues","author":"Davoodnia","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0005","series-title":"CVPR","first-page":"20448","article-title":"Uncertainty-aware adaptation for self-supervised 3d human pose estimation","author":"Kundu","year":"2022"},{"key":"10.1016\/j.patcog.2026.113072_bib0006","series-title":"AAAI","first-page":"882","article-title":"Disentangled diffusion-based 3d human pose estimation with hierarchical spatial and temporal denoiser","volume":"vol. 38","author":"Cai","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0007","series-title":"ACM MM","first-page":"681","article-title":"Geometry-guided diffusion model with masked transformer for robust multi-view 3D human pose estimation","author":"Zhang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0008","series-title":"ACM MM","first-page":"7512","article-title":"Efficient hierarchical multi-view fusion transformer for 3D human pose estimation","author":"Zhou","year":"2023"},{"key":"10.1016\/j.patcog.2026.113072_bib0009","series-title":"AAAI","first-page":"7205","article-title":"Deep semantic graph transformer for multi-view 3D human pose estimation","volume":"vol. 38","author":"Zhang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0010","article-title":"ESMformer: error-aware self-supervised transformer for multi-view 3D human pose estimation","volume":"158","author":"Zhang","year":"2025","journal-title":"PR"},{"key":"10.1016\/j.patcog.2026.113072_bib0011","series-title":"NeurIPS","first-page":"27394","article-title":"A single 2D pose with context is worth hundreds for 3d human pose estimation","volume":"vol. 36","author":"Zhao","year":"2023"},{"key":"10.1016\/j.patcog.2026.113072_bib0012","series-title":"ACM MM","first-page":"1672","article-title":"APP: adaptive pose pooling for 3D human pose estimation from videos","author":"Zhang","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0013","first-page":"1282","article-title":"Exploiting temporal contexts with strided transformer for 3d human pose estimation","volume":"25","author":"Li","year":"2022","journal-title":"IEEE TMM"},{"key":"10.1016\/j.patcog.2026.113072_bib0014","series-title":"CVPR","first-page":"13232","article-title":"Mixste: seq2seq mixed spatio-temporal encoder for 3D human pose estimation in video","author":"Zhang","year":"2022"},{"key":"10.1016\/j.patcog.2026.113072_bib0015","article-title":"Kinematics-aware spatial-temporal feature transform for 3D human pose estimation","volume":"150","author":"Du","year":"2024","journal-title":"PR"},{"key":"10.1016\/j.patcog.2026.113072_bib0016","series-title":"CVPR","first-page":"1123","article-title":"KTPFormer: kinematics and trajectory prior knowledge-enhanced transformer for 3D human pose estimation","author":"Peng","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0017","series-title":"CVPR","first-page":"1077","article-title":"Self-supervised learning of 3D human pose using multi-view geometry","author":"Kocabas","year":"2019"},{"key":"10.1016\/j.patcog.2026.113072_bib0018","series-title":"CVPR","first-page":"6040","article-title":"Lightweight multi-view 3D pose estimation through camera-disentangled representation","author":"Remelli","year":"2020"},{"key":"10.1016\/j.patcog.2026.113072_bib0019","series-title":"CVPR","first-page":"8437","article-title":"Learning monocular 3D human pose estimation from multi-view images","author":"Rhodin","year":"2018"},{"key":"10.1016\/j.patcog.2026.113072_bib0020","series-title":"CVPR","first-page":"3618","article-title":"3D Pictorial structures for multiple view articulated pose estimation","author":"Burenius","year":"2013"},{"key":"10.1016\/j.patcog.2026.113072_bib0021","series-title":"CVPR","first-page":"6988","article-title":"Harvesting multiple views for marker-less 3d human pose annotations","author":"Pavlakos","year":"2017"},{"key":"10.1016\/j.patcog.2026.113072_bib0022","series-title":"ICCV","first-page":"4342","article-title":"Cross view fusion for 3D human pose estimation","author":"Qiu","year":"2019"},{"key":"10.1016\/j.patcog.2026.113072_bib0023","series-title":"CVPR","first-page":"7779","article-title":"Epipolar transformers","author":"He","year":"2020"},{"key":"10.1016\/j.patcog.2026.113072_bib0024","doi-asserted-by":"crossref","DOI":"10.15607\/RSS.2021.XVII.040","article-title":"Real-time multi-view 3D human pose estimation using semantic feedback to smart edge sensors","author":"Bultmann","year":"2021","journal-title":"RSS"},{"key":"10.1016\/j.patcog.2026.113072_bib0025","series-title":"ICCV","first-page":"7718","article-title":"Learnable triangulation of human pose","author":"Iskakov","year":"2019"},{"key":"10.1016\/j.patcog.2026.113072_bib0026","series-title":"CVPR","first-page":"11028","article-title":"Generalizable human pose triangulation","author":"Bartol","year":"2022"},{"issue":"3","key":"10.1016\/j.patcog.2026.113072_bib0027","doi-asserted-by":"crossref","first-page":"869","DOI":"10.1007\/s11263-021-01570-9","article-title":"Consensus-based optimization for 3D human pose estimation in camera coordinates","volume":"130","author":"Luvizon","year":"2022","journal-title":"IJCV"},{"key":"10.1016\/j.patcog.2026.113072_bib0028","series-title":"ECCV","first-page":"176","article-title":"FLEX: extrinsic parameters-free multi-view 3D human motion reconstruction","author":"Gordon","year":"2022"},{"key":"10.1016\/j.patcog.2026.113072_bib0029","series-title":"WACV","first-page":"429","article-title":"DeepFuse: an IMU-aware network for real-time 3D human pose estimation from multi-view image","author":"Huang","year":"2020"},{"issue":"4","key":"10.1016\/j.patcog.2026.113072_bib0030","doi-asserted-by":"crossref","first-page":"4122","DOI":"10.1109\/TPAMI.2022.3188716","article-title":"Adaptive multi-view and temporal fusing transformer for 3D human pose estimation","volume":"45","author":"Shuai","year":"2022","journal-title":"IEEE TPAMI"},{"key":"10.1016\/j.patcog.2026.113072_bib0031","series-title":"WACV","first-page":"6920","article-title":"MotionAGFormer: enhancing 3D human pose estimation with a transformer-GCNFormer network","author":"Mehraban","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0032","article-title":"Multi-hop graph transformer network for 3D human pose estimation","volume":"101","author":"Islam","year":"2024","journal-title":"JVCIR"},{"key":"10.1016\/j.patcog.2026.113072_bib0033","first-page":"1","article-title":"ST-trans: spatial-temporal transformer for infrared small target detection in sequential images","volume":"62","author":"Tong","year":"2024","journal-title":"TGRS"},{"issue":"9","key":"10.1016\/j.patcog.2026.113072_bib0034","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3679017","article-title":"Spatio-temporal parallel transformer based model for traffic prediction","volume":"18","author":"Kumar","year":"2024","journal-title":"TKDD"},{"issue":"10","key":"10.1016\/j.patcog.2026.113072_bib0035","first-page":"1","article-title":"Transformer-Based and structure-aware dual-stream network for low-light image enhancement","volume":"21","author":"Zhou","year":"2025","journal-title":"TOMM"},{"key":"10.1016\/j.patcog.2026.113072_bib0036","article-title":"Enhancing outdoor vision: binocular desnowing with dual-stream temporal transformer","volume":"170","author":"Yu","year":"2026","journal-title":"PR"},{"key":"10.1016\/j.patcog.2026.113072_bib0037","series-title":"ICCV","first-page":"3941","article-title":"Learning to fuse 2d and 3d image cues for monocular body pose estimation","author":"Tekin","year":"2017"},{"key":"10.1016\/j.patcog.2026.113072_bib0038","series-title":"CVPR","first-page":"3425","article-title":"Semantic graph convolutional networks for 3d human pose regression","author":"Zhao","year":"2019"},{"key":"10.1016\/j.patcog.2026.113072_bib0039","article-title":"Single image based 3D human pose estimation via uncertainty learning","volume":"132","author":"Han","year":"2022","journal-title":"PR"},{"key":"10.1016\/j.patcog.2026.113072_bib0040","series-title":"CVPR","first-page":"1","article-title":"YOLOv3: an incremental improvement","volume":"vol. 1804","author":"Farhadi","year":"2018"},{"key":"10.1016\/j.patcog.2026.113072_bib0041","series-title":"CVPR","first-page":"7103","article-title":"Cascaded pyramid network for multi-person pose estimation","author":"Chen","year":"2018"},{"issue":"7","key":"10.1016\/j.patcog.2026.113072_bib0042","doi-asserted-by":"crossref","first-page":"1325","DOI":"10.1109\/TPAMI.2013.248","article-title":"Human3. 6m: large scale datasets and predictive methods for 3D human sensing in natural environments","volume":"36","author":"Ionescu","year":"2013","journal-title":"IEEE TPAMI"},{"key":"10.1016\/j.patcog.2026.113072_bib0043","series-title":"3DV","first-page":"506","article-title":"Monocular 3D human pose estimation in the wild using improved CNN supervision","author":"Mehta","year":"2017"},{"issue":"8","key":"10.1016\/j.patcog.2026.113072_bib0044","doi-asserted-by":"crossref","first-page":"671","DOI":"10.3390\/rs8080671","article-title":"Three-dimensional body and centre of mass kinematics in alpine ski racing using differential GNSS and inertial sensors","volume":"8","author":"Fasel","year":"2016","journal-title":"Remote Sens."},{"key":"10.1016\/j.patcog.2026.113072_bib0045","unstructured":"D.P. Kingma, Adam: a method for stochastic optimization,(2014) arXiv: 1412.6980."},{"issue":"1","key":"10.1016\/j.patcog.2026.113072_bib0046","first-page":"198","article-title":"Anatomy-aware 3D human pose estimation with bone-based pose decomposition","volume":"32","author":"Chen","year":"2021","journal-title":"IEEE TCSVT"},{"key":"10.1016\/j.patcog.2026.113072_bib0047","series-title":"CVPR","first-page":"4800","article-title":"GFPose: learning 3D human pose prior with gradient fields","author":"Ci","year":"2023"},{"key":"10.1016\/j.patcog.2026.113072_bib0048","article-title":"Multi-hypothesis representation learning for transformer-based 3D human pose estimation","volume":"141","author":"Li","year":"2023","journal-title":"PR"},{"key":"10.1016\/j.patcog.2026.113072_bib0049","series-title":"AAAI","first-page":"7632","article-title":"Lifting by image\u2013leveraging image cues for accurate 3d human pose estimation","volume":"vol. 38","author":"Zhou","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0050","series-title":"NeurIPS","article-title":"A single 2D pose with context is worth hundreds for 3D human pose estimation","volume":"vol. 36","author":"Zhao","year":"2024"},{"key":"10.1016\/j.patcog.2026.113072_bib0051","series-title":"CVPR","first-page":"13294","article-title":"CanonPose: self-supervised monocular 3D human pose estimation in the wild","author":"Wandt","year":"2021"},{"key":"10.1016\/j.patcog.2026.113072_bib0052","doi-asserted-by":"crossref","first-page":"97","DOI":"10.1016\/j.neucom.2022.02.076","article-title":"Self-supervised 3D human pose estimation from video","volume":"488","author":"Gholami","year":"2022","journal-title":"Neurocomputing"},{"key":"10.1016\/j.patcog.2026.113072_bib0053","series-title":"ECCV","first-page":"483","article-title":"Stacked hourglass networks for human pose estimation","author":"Newell","year":"2016"},{"key":"10.1016\/j.patcog.2026.113072_bib0054","series-title":"CVPR","first-page":"5693","article-title":"Deep high-resolution representation learning for human pose estimation","author":"Sun","year":"2019"},{"issue":"11","key":"10.1016\/j.patcog.2026.113072_bib0055","article-title":"Visualizing data using t-SNE","volume":"9","author":"Van der Maaten","year":"2008","journal-title":"JMLR"}],"container-title":["Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S003132032600035X?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S003132032600035X?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,3,20]],"date-time":"2026-03-20T05:17:11Z","timestamp":1773983831000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S003132032600035X"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,7]]},"references-count":55,"alternative-id":["S003132032600035X"],"URL":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113072","relation":{},"ISSN":["0031-3203"],"issn-type":[{"value":"0031-3203","type":"print"}],"subject":[],"published":{"date-parts":[[2026,7]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"DSVTformer: Dual-stream spatial-view-temporal transformer for multi-view 3D human pose estimation","name":"articletitle","label":"Article Title"},{"value":"Pattern Recognition","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.patcog.2026.113072","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"113072"}}