{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T23:52:49Z","timestamp":1780444369992,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,28]],"date-time":"2024-10-28T00:00:00Z","timestamp":1730073600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 62203476"],"award-info":[{"award-number":["No. 62203476"]}],"id":[{"id":"10.13039\/https:\/\/doi.org\/10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen Innovation in Science and Technology Foundation for The Excellent Youth Scholars","award":["No. RCYX20231211090248064"],"award-info":[{"award-number":["No. RCYX20231211090248064"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,28]]},"DOI":"10.1145\/3664647.3680880","type":"proceedings-article","created":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T06:59:33Z","timestamp":1729925973000},"page":"1672-1681","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["APP: Adaptive Pose Pooling for 3D Human Pose Estimation from Videos"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-3013-5914","authenticated-orcid":false,"given":"Jinyan","family":"Zhang","sequence":"first","affiliation":[{"name":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4036-9315","authenticated-orcid":false,"given":"Mengyuan","family":"Liu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7498-6541","authenticated-orcid":false,"given":"Hong","family":"Liu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2401-7435","authenticated-orcid":false,"given":"Guoquan","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8048-2668","authenticated-orcid":false,"given":"Wenhao","family":"Li","sequence":"additional","affiliation":[{"name":"State Key Laboratory of General Artificial Intelligence, Peking University, Shenzhen Graduate School, Shenzhen, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2024,10,28]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/IV55152.2023.10186575"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00539"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00236"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/65"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00742"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV56688.2023.00287"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00235"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_41"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.89"},{"key":"e_1_3_2_1_10_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly Jakob Uszkoreit and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In ICLR."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01251"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01080"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIPR.2019.00036"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00780"},{"key":"e_1_3_2_1_15_1","unstructured":"Dan Hendrycks and Kevin Gimpel. 2016. Bridging Nonlinearities and Stochastic Regularizers with Gaussian Error Linear Units. (2016)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.248"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00781"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00744"},{"key":"e_1_3_2_1_19_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_20_1","volume-title":"International Conference on Learning Representations.","author":"Larsson Gustav","year":"2016","unstructured":"Gustav Larsson, Michael Maire, and Gregory Shakhnarovich. 2016. FractalNet: Ultra-Deep Neural Networks without Residuals. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_21_1","first-page":"1282","article-title":"Exploiting Temporal Contexts with Strided Transformer for 3D Human Pose Estimation","volume":"25","author":"Li Wenhao","year":"2022","unstructured":"Wenhao Li, Hong Liu, Runwei Ding, Mengyuan Liu, Pichao Wang, and Wenming Yang. 2022. Exploiting Temporal Contexts with Strided Transformer for 3D Human Pose Estimation. IEEE TMM, Vol. 25 (2022), 1282--1293.","journal-title":"IEEE TMM"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01280"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00064"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-17688-3_31"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00511"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.288"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00677"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Dushyant Mehta Helge Rhodin Dan Casas Pascal Fua Oleksandr Sotnychenko Weipeng Xu and Christian Theobalt. 2017. Monocular 3D human pose estimation in the wild using improved CNN supervision. In 3DV. 506--516.","DOI":"10.1109\/3DV.2017.00064"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3072959.3073596"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46484-8_29"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.139"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00794"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00113"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01494"},{"key":"e_1_3_2_1_35_1","volume-title":"YOLOv3: An Incremental Improvement. arXiv","author":"Redmon Joseph","year":"2018","unstructured":"Joseph Redmon and Ali Farhadi. 2018. YOLOv3: An Incremental Improvement. arXiv (2018)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_46"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20065-6_27"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01356"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00584"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_33"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00464"},{"key":"e_1_3_2_1_42_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is All you Need. In NeurIPS. 5998--6008."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00098"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00810"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01288"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01398-9"},{"key":"e_1_3_2_1_47_1","volume-title":"Thirty-seventh Conference on Neural Information Processing Systems.","author":"Zhao Qitao","year":"2023","unstructured":"Qitao Zhao, Ce Zheng, Mengyuan Liu, and Chen Chen. 2023. A Single 2D Pose with Context is Worth Hundreds for 3D Human Pose Estimation. In Thirty-seventh Conference on Neural Information Processing Systems."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00857"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01145"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00243"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01385"},{"key":"e_1_3_2_1_52_1","volume-title":"Deformable DETR: Deformable Transformers for End-to-End Object Detection. In International Conference on Learning Representations.","author":"Zhu Xizhou","year":"2020","unstructured":"Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, and Jifeng Dai. 2020. Deformable DETR: Deformable Transformers for End-to-End Object Detection. In International Conference on Learning Representations."}],"event":{"name":"MM '24: The 32nd ACM International Conference on Multimedia","location":"Melbourne VIC Australia","acronym":"MM '24","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 32nd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680880","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3664647.3680880","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:17:33Z","timestamp":1750295853000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3664647.3680880"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,28]]},"references-count":52,"alternative-id":["10.1145\/3664647.3680880","10.1145\/3664647"],"URL":"https:\/\/doi.org\/10.1145\/3664647.3680880","relation":{},"subject":[],"published":{"date-parts":[[2024,10,28]]},"assertion":[{"value":"2024-10-28","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}