{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T00:25:04Z","timestamp":1778372704945,"version":"3.51.4"},"reference-count":47,"publisher":"Oxford University Press (OUP)","issue":"9","license":[{"start":{"date-parts":[[2025,8,5]],"date-time":"2025-08-05T00:00:00Z","timestamp":1754352000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100018537","name":"National Science and Technology Major Project","doi-asserted-by":"publisher","award":["2024ZD1300700"],"award-info":[{"award-number":["2024ZD1300700"]}],"id":[{"id":"10.13039\/501100018537","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,9,2]]},"abstract":"<jats:title>Abstract<\/jats:title>\n               <jats:p>Vision-language models with large-scale image-text pairs have shown significant potential on representation learning. Human pose estimation task, which is highly sensitive to pixel-wise transformation, requires effective methods for mining pose-specific knowledge. In this paper, we investigate the homologous human pose retrieval task relying on large-scale annotated datasets to enhance pose knowledge extraction. We propose Pose Prompt (PosePro), which leverages vision-language models to categorize global pose configuration of an image, build compatible design, generate pose embedding as proposals. We then aim to integrate the learned knowledge as visual and textual prompt to facilitate the learning processing of newly unseen tasks. We demonstrate the effectiveness of fundamental PosePro model through extensive experiments on both pose retrieval and human pose estimation, showing significant improvements in accuracy and generalization ability, especially in scenarios with limited samples.<\/jats:p>","DOI":"10.1093\/jcde\/qwaf079","type":"journal-article","created":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T12:26:17Z","timestamp":1756902377000},"page":"32-45","source":"Crossref","is-referenced-by-count":2,"title":["Vision-language model guided pose knowledge mining for human pose estimation"],"prefix":"10.1093","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5753-7728","authenticated-orcid":false,"given":"Yilei","family":"Chen","sequence":"first","affiliation":[{"name":"School of Artificial Intelligence, Xidian University , Xi\u2019an, Shaanxi 710071 ,","place":["PR China"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7857-0845","authenticated-orcid":false,"given":"Xuemei","family":"Xie","sequence":"additional","affiliation":[{"name":"Guangzhou Institute of Technology, Xidian University , Guangzhou, Guangdong 510555 ,","place":["PR China"]},{"name":"Pazhou Lab , Huangpu, Guangzhou 510555 ,","place":["PR China"]}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fu","family":"Li","sequence":"additional","affiliation":[{"name":"School of Artificial Intelligence, Xidian University , Xi\u2019an, Shaanxi 710071 ,","place":["PR China"]}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"286","published-online":{"date-parts":[[2025,8,5]]},"reference":[{"key":"2025090308261284500_bib1","first-page":"3686","article-title":"2d human pose estimation: New benchmark and state of the art analysis","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Andriluka","year":"2014"},{"key":"2025090308261284500_bib2","first-page":"606","article-title":"Adversarial semantic data augmentation for human pose estimation","volume-title":"European Conference on Computer Vision","author":"Bin","year":"2020"},{"key":"2025090308261284500_bib3","doi-asserted-by":"publisher","first-page":"107410","DOI":"10.1016\/j.patcog.2020.107410","article-title":"Structure-aware human pose estimation with graph convolutional networks","volume":"106","author":"Bin","year":"2020","journal-title":"Pattern Recognition"},{"key":"2025090308261284500_bib4","first-page":"1221","article-title":"Adversarial posenet: A structure-aware convolutional network for human pose estimation","volume-title":"IEEE International Conference on Computer Vision","author":"Chen","year":"2017"},{"key":"2025090308261284500_bib5","doi-asserted-by":"publisher","first-page":"21012","DOI":"10.1007\/s10489-023-04521-8","article-title":"Structure guided network for human pose estimation","volume":"53","author":"Chen","year":"2023","journal-title":"Applied Intelligence"},{"key":"2025090308261284500_bib6","first-page":"248","article-title":"ImageNet: A large-scale hierarchical image database","volume-title":"IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops (CVPR Workshops)","author":"Deng","year":"2009"},{"key":"2025090308261284500_bib7","doi-asserted-by":"publisher","first-page":"106524","DOI":"10.1016\/j.neunet.2024.106524","article-title":"Boosting integral-based human pose estimation through implicit heatmap learning","volume":"179","author":"Du","year":"2024","journal-title":"Neural Networks"},{"key":"2025090308261284500_bib8","first-page":"14084","article-title":"Learning to prompt for open-vocabulary object detection with vision-language model","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Du","year":"2022"},{"key":"2025090308261284500_bib9","first-page":"13041","article-title":"Diffpose: Toward more reliable 3d pose estimation","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Gong","year":"2023"},{"key":"2025090308261284500_bib10","first-page":"1","article-title":"Invariant representation learning for infant pose estimation with small data","volume-title":"IEEE International Conference on Automatic Face and Gesture Recognition","author":"Huang","year":"2021"},{"key":"2025090308261284500_bib11","doi-asserted-by":"publisher","first-page":"53","DOI":"10.1093\/jcde\/qwae108","article-title":"Knowledge graph-based multi-granularity tacit design knowledge reuse for product design","volume":"12","author":"Jia","year":"2024","journal-title":"Journal of Computational Design and Engineering"},{"key":"2025090308261284500_bib12","first-page":"12.1","article-title":"Clustered pose and nonlinear appearance models for human pose estimation","volume-title":"British Machine Vision Conference","author":"Johnson","year":"2010"},{"key":"2025090308261284500_bib13","article-title":"Human pose descriptions and subject-focused attention for improved zero-shot transfer in human-centric classification tasks","author":"Khan","year":"2024"},{"key":"2025090308261284500_bib14","first-page":"2288","article-title":"Deep metric learning beyond binary supervision","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Kim","year":"2019"},{"key":"2025090308261284500_bib15","first-page":"4938","article-title":"Thin-slicing for pose: Learning to understand pose without explicit pose estimation","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Kwak","year":"2016"},{"key":"2025090308261284500_bib16","first-page":"2933","article-title":"Image-free domain generalization via clip for 3d hand pose estimation","volume-title":"IEEE Winter Conference on Applications of Computer Vision","author":"Lee","year":"2022"},{"key":"2025090308261284500_bib17","first-page":"6501","article-title":"Distilling detr with visual-linguistic knowledge for open-vocabulary object detection","volume-title":"IEEE International Conference on Computer Vision","author":"Li","year":"2023"},{"key":"2025090308261284500_bib18","doi-asserted-by":"publisher","first-page":"1355","DOI":"10.1093\/jcde\/qwab050","article-title":"Target unbiased meta-learning for graph classification","volume":"8","author":"Li","year":"2021","journal-title":"Journal of Computational Design and Engineering"},{"key":"2025090308261284500_bib19","first-page":"11293","article-title":"Tokenpose: Learning keypoint tokens for human pose estimation","volume-title":"IEEE International Conference on Computer Vision","author":"Li","year":"2021"},{"key":"2025090308261284500_bib20","first-page":"11720","article-title":"Online knowledge distillation for efficient pose estimation","volume-title":"IEEE International Conference on Computer Vision","author":"Li","year":"2021"},{"key":"2025090308261284500_bib21","first-page":"740","article-title":"Microsoft coco: Common objects in context","volume-title":"European Conference on Computer Vision","author":"Lin","year":"2014"},{"key":"2025090308261284500_bib22","doi-asserted-by":"publisher","first-page":"1082","DOI":"10.1007\/s00530-024-01368-y","article-title":"Sd-pose: facilitating space-decoupled human pose estimation via adaptive pose perception guidance","volume":"30","author":"Liu","year":"2024","journal-title":"Multimedia Systems"},{"key":"2025090308261284500_bib23","first-page":"128","article-title":"Overlock: An overview-first-look-closely-next convnet with context-mixing dynamic kernels","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Lou","year":"2025"},{"key":"2025090308261284500_bib24","first-page":"424","article-title":"Ppt: token-pruned pose transformer for monocular and multi-view human pose estimation","volume-title":"European Conference on Computer Vision","author":"Ma","year":"2022"},{"key":"2025090308261284500_bib25","first-page":"2345","article-title":"Posebits for monocular human pose estimation","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Pons-Moll","year":"2014"},{"key":"2025090308261284500_bib26","unstructured":"Projects R. U. (2023). Fall detection dataset. Roboflow\u00a0Universe,\u00a0Roboflow."},{"key":"2025090308261284500_bib27","volume-title":"Learning structure-guided diffusion model for 2d human pose estimation","author":"Qiu","year":"2023"},{"key":"2025090308261284500_bib28","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International Conference on Machine Learning","author":"Radford","year":"2021"},{"key":"2025090308261284500_bib29","first-page":"18061","article-title":"Denseclip: Language-guided dense prediction with context-aware prompting","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Rao","year":"2022"},{"key":"2025090308261284500_bib30","first-page":"369","article-title":"Benchmarking and error diagnosis in multi-instance pose estimation","volume-title":"IEEE International Conference on Computer Vision","author":"Ronchi","year":"2017"},{"key":"2025090308261284500_bib31","doi-asserted-by":"publisher","first-page":"187","DOI":"10.1093\/jcde\/qwab075","article-title":"Anomaly detection with vision-based deep learning for epidemic prevention and control","volume":"9","author":"Samani","year":"2022","journal-title":"Journal of Computational Design and Engineering"},{"key":"2025090308261284500_bib32","first-page":"7464","article-title":"Yolov7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Wang","year":"2023"},{"key":"2025090308261284500_bib33","first-page":"1899","article-title":"Clip-guided prototype modulating for few-shot action recognition","volume-title":"International Journal of Computer Vision (IJCV)","author":"Wang","year":"2023"},{"key":"2025090308261284500_bib34","first-page":"28619","article-title":"Stronger fewer & superior: Harnessing vision foundation models for domain generalized semantic segmentation","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Wei","year":"2024"},{"key":"2025090308261284500_bib35","first-page":"472","article-title":"Simple baselines for human pose estimation and tracking","volume-title":"European Conference on Computer Vision","author":"Xiao","year":"2018"},{"key":"2025090308261284500_bib37","first-page":"2955","article-title":"Adaptive hypergraph neural network for multi-person pose estimation","volume-title":"AAAI Conference on Artificial Intelligence","author":"Xu","year":"2022"},{"key":"2025090308261284500_bib38","first-page":"38571","article-title":"ViTPose: Simple vision transformer baselines for human pose estimation","volume-title":"Advances in Neural Information Processing Systems","author":"Xu","year":"2022"},{"key":"2025090308261284500_bib36","doi-asserted-by":"publisher","first-page":"436","DOI":"10.1109\/LSP.2020.2975426","article-title":"Integral knowledge distillation for multi-person pose estimation","volume":"27","author":"Xu","year":"2020","journal-title":"IEEE Signal Processing Letters"},{"key":"2025090308261284500_bib39","first-page":"19163","article-title":"Unified contrastive learning in image-text-label space","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Yang","year":"2022"},{"key":"2025090308261284500_bib40","first-page":"249","article-title":"X-pose: Detection any keypoints","volume-title":"European Conference on Computer Vision","author":"Yang","year":"2024"},{"key":"2025090308261284500_bib41","doi-asserted-by":"publisher","first-page":"3539","DOI":"10.1145\/3394486.3406466","article-title":"Learning with small data","volume-title":"26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining","author":"Yao","year":"2020"},{"key":"2025090308261284500_bib42","first-page":"3512","article-title":"Fast human pose estimation","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Zhang","year":"2019"},{"key":"2025090308261284500_bib44","first-page":"889","article-title":"Pose2seg: Detection free human instance segmentation","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Zhang","year":"2019"},{"key":"2025090308261284500_bib43","first-page":"5625","article-title":"Vision-language models for vision tasks: A survey","volume-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence","author":"Zhang","year":"2024"},{"key":"2025090308261284500_bib45","first-page":"23272","article-title":"Clamp: Prompt-based contrastive learning for connecting language and animal pose","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Zhang","year":"2023"},{"key":"2025090308261284500_bib46","first-page":"16795","article-title":"Conditional prompt learning for vision-language models","volume-title":"IEEE Conference on Computer Vision and Pattern Recognition","author":"Zhou","year":"2022"},{"key":"2025090308261284500_bib47","doi-asserted-by":"publisher","first-page":"2337","DOI":"10.1007\/s11263-022-01653-1","article-title":"Learning to prompt for vision-language models","author":"Zhou","year":"2022","journal-title":"International Journal of Computer Vision (IJCV)"}],"container-title":["Journal of Computational Design and Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/academic.oup.com\/jcde\/article-pdf\/12\/9\/32\/63957093\/qwaf079.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/academic.oup.com\/jcde\/article-pdf\/12\/9\/32\/63957093\/qwaf079.pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,3]],"date-time":"2025-09-03T12:26:28Z","timestamp":1756902388000},"score":1,"resource":{"primary":{"URL":"https:\/\/academic.oup.com\/jcde\/article\/12\/9\/32\/8222500"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,5]]},"references-count":47,"journal-issue":{"issue":"9","published-print":{"date-parts":[[2025,9,2]]}},"URL":"https:\/\/doi.org\/10.1093\/jcde\/qwaf079","relation":{},"ISSN":["2288-5048"],"issn-type":[{"value":"2288-5048","type":"electronic"}],"subject":[],"published-other":{"date-parts":[[2025,9]]},"published":{"date-parts":[[2025,8,5]]}}}