{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,7]],"date-time":"2026-04-07T16:21:39Z","timestamp":1775578899774,"version":"3.50.1"},"reference-count":141,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"7","license":[{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T00:00:00Z","timestamp":1751328000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"JC STEM Lab of AI for Science and Engineering"},{"name":"Hong Kong Jockey Club Charities Trust, the Research Grants Council"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1109\/tpami.2025.3552604","type":"journal-article","created":{"date-parts":[[2025,3,18]],"date-time":"2025-03-18T17:36:21Z","timestamp":1742319381000},"page":"5672-5689","source":"Crossref","is-referenced-by-count":4,"title":["Hulk: A Universal Knowledge Translator for Human-Centric Tasks"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6819-5872","authenticated-orcid":false,"given":"Yizhou","family":"Wang","sequence":"first","affiliation":[{"name":"Department of Information Engineering, The Chinese University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7585-4903","authenticated-orcid":false,"given":"Yixuan","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Medicine, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-3505-0056","authenticated-orcid":false,"given":"Weizhen","family":"He","sequence":"additional","affiliation":[{"name":"College of Electrical Engineering, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9140-4724","authenticated-orcid":false,"given":"Xun","family":"Guo","sequence":"additional","affiliation":[{"name":"Department of Automation, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4309-170X","authenticated-orcid":false,"given":"Feng","family":"Zhu","sequence":"additional","affiliation":[{"name":"SenseTime Group Ltd., Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3378-7201","authenticated-orcid":false,"given":"Lei","family":"Bai","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5874-131X","authenticated-orcid":false,"given":"Rui","family":"Zhao","sequence":"additional","affiliation":[{"name":"SenseTime Group Ltd., Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3230-6392","authenticated-orcid":false,"given":"Jian","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Public Health, Zhejiang University, Hangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2772-9320","authenticated-orcid":false,"given":"Tong","family":"He","sequence":"additional","affiliation":[{"name":"Shanghai Artificial Intelligence Laboratory, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9163-2761","authenticated-orcid":false,"given":"Wanli","family":"Ouyang","sequence":"additional","affiliation":[{"name":"Department of Information Engineering, The Chinese University of Hong Kong, Hong Kong, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-0067-339X","authenticated-orcid":false,"given":"Shixiang","family":"Tang","sequence":"additional","affiliation":[{"name":"CUHK Interdisciplinary Artificial Intelligence Research Institute, Hong Kong, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01385"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01711"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2740564"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3079910"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2020.3033165"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01451"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2892985"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.261"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00761"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3222784"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3596711.3596800"},{"key":"ref12","first-page":"1","article-title":"Decomposing the immeasurable sport: A deep learning expected possession value framework for soccer","volume-title":"Proc. 13th MIT Sloan Sports Analytics Conf.","author":"Fern\u00e1ndez"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330758"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00394"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-023-35190-9"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.14313\/jamris\/3-2022\/22"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1155\/2022\/2130172"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3610875"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/INDIN41052.2019.8972114"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3613810"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/PHM-Nanjing52125.2021.9613117"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3215746"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01376"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00590"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2022.3192989"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2021.3108771"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2542983"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01445"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01568"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02104"},{"key":"ref31","article-title":"HAP: Structure-aware masked image modeling for human-centric perception","author":"Yuan","year":"2023"},{"key":"ref32","article-title":"SMPLer-x: Scaling up expressive human pose and shape estimation","author":"Cai","year":"2023"},{"key":"ref34","article-title":"LLaMA: Open and efficient foundation language models","author":"Touvron","year":"2023"},{"key":"ref35","article-title":"InternLM: A multilingual language model with progressively enhanced capabilities","year":"2023"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1016\/j.aiopen.2021.01.001"},{"key":"ref37","first-page":"7281","article-title":"HRFormer: High-resolution vision transformer for dense predict","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Yuan"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00443"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00199"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19769-7_20"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2876404"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00584"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00455"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01112"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00093"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12328"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01230"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01308"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01317"},{"key":"ref52","article-title":"Unik: A unified framework for real-world skeleton-based action recognition","author":"Yang","year":"2021"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01311"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00026"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1016\/j.cviu.2021.103219"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-69541-5_3"},{"key":"ref57","article-title":"Spatio-temporal tuples transformer for skeleton-based action recognition","author":"Qiu","year":"2022"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19775-8_33"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2023\/203"},{"key":"ref60","first-page":"1","article-title":"Explicit box detection unifies end-to-end multi-person pose estimation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Yang"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01350"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.715"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3174529"},{"key":"ref64","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.544"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP43922.2022.9747631"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01457"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01443"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00907"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19836-6_20"},{"key":"ref71","article-title":"Masked vision and language modeling for multi-modal representation learning","author":"Kwon","year":"2022"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00233"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02157"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00660"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01208"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/iccv51070.2023.00110"},{"key":"ref77","article-title":"LAMM: Language-assisted multi-modal instruction-tuning dataset, framework, and benchmark","author":"Yin","year":"2023"},{"key":"ref78","article-title":"Shikra: Unleashing multimodal LLM\u2019s referential dialogue magic","author":"Chen","year":"2023"},{"key":"ref79","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","author":"Li","year":"2023"},{"key":"ref80","article-title":"Visual instruction tuning","author":"Liu","year":"2023"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2025.3571946"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.naacl-long.440"},{"key":"ref83","article-title":"PandaGPT: One model to instruction-follow them all","author":"Su","year":"2023"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72698-9_8"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00120"},{"key":"ref86","article-title":"LLark: A multimodal foundation model for music","author":"Gardner","year":"2023"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28567"},{"key":"ref88","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2021"},{"key":"ref89","article-title":"LLaMA-adapter: Efficient fine-tuning of language models with zero-init attention","author":"Zhang","year":"2023"},{"key":"ref90","article-title":"PonderV2: Pave the way for 3D foundataion model with a universal pre-training paradigm","author":"Zhu","year":"2023"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01474"},{"key":"ref92","first-page":"23318","article-title":"OFA: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref93","article-title":"Unified-IO: A unified model for vision, language, and multi-modal tasks","author":"Lu","year":"2022"},{"key":"ref94","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2018"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01249-6_37"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.248"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1109\/ICME.2019.00256"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_47"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2878349"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.46"},{"key":"ref102","article-title":"CrowdHuman: A benchmark for detecting human in a crowd","author":"Shao","year":"2018"},{"key":"ref103","article-title":"PLIP: Language-image pre-training for person representation learning","author":"Zuo","year":"2023"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3048039"},{"issue":"8","key":"ref105","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00542"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW.2015.51"},{"key":"ref109","article-title":"The Eurocity persons dataset: A novel benchmark for object detection","author":"Braun","year":"2018"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2929005"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr.2016.115"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00269"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01231-1_32"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.551"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1109\/3DV.2018.00024"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3450537"},{"key":"ref117","article-title":"An image is worth 16 \u00d7 16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"ref119","article-title":"A study of checkpointing in large scale training of deep neural networks","author":"Rojas","year":"2020"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20077-9_17"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01223"},{"key":"ref123","article-title":"V2F-Net: Explicit decomposition of occluded pedestrian detection","author":"Shang","year":"2021"},{"key":"ref124","article-title":"DETR for crowd pedestrian detection","author":"Lin","year":"2020"},{"key":"ref125","article-title":"Deformable DETR: Deformable transformers for end-to-end object detection","author":"Zhu","year":"2020"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01422"},{"key":"ref127","article-title":"ViTPose: Simple vision transformer baselines for human pose estimation","author":"Xu","year":"2022"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00100"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i1.19991"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3330016"},{"key":"ref131","article-title":"Openmmlab pose estimation toolbox and benchmark","year":"2020"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2916873"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00022"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00298"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01282"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00516"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01094"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01140"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20065-6_34"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19769-7_24"},{"key":"ref141","first-page":"12888","article-title":"BLIP: Bootstrapping language-image pre-training for unified vision-language understanding and generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref142","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.488"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11026037\/10930828.pdf?arnumber=10930828","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T04:19:08Z","timestamp":1749183548000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10930828\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7]]},"references-count":141,"journal-issue":{"issue":"7"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3552604","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,7]]}}}