{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T01:36:52Z","timestamp":1772501812394,"version":"3.50.1"},"publisher-location":"Cham","reference-count":71,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729515","type":"print"},{"value":"9783031729522","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,1]],"date-time":"2024-10-01T00:00:00Z","timestamp":1727740800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72952-2_15","type":"book-chapter","created":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T05:02:02Z","timestamp":1727672522000},"page":"249-268","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["X-Pose: Detecting Any Keypoints"],"prefix":"10.1007","author":[{"given":"Jie","family":"Yang","sequence":"first","affiliation":[]},{"given":"Ailing","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Ruimao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Lei","family":"Zhang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,1]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Cao, J., Tang, H., Fang, H.S., Shen, X., Lu, C., Tai, Y.W.: Cross-domain adaptation for animal pose estimation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 
9498\u20139507 (2019)","DOI":"10.1109\/ICCV.2019.00959"},{"key":"15_CR2","unstructured":"Chen, L.H., et al.: MotionLLM: understanding human behaviors from human motions and videos. arXiv preprint arXiv:2405.20340 (2024)"},{"key":"15_CR3","doi-asserted-by":"crossref","unstructured":"Chen, Z., et\u00a0al.: InternVL: Scaling up vision foundation models and aligning for generic visual-linguistic tasks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 24185\u201324198 (2024)","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Cheng, B., Xiao, B., Wang, J., Shi, H., Huang, T.S., Zhang, L.: HigherHRNet: scale-aware representation learning for bottom-up human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5386\u20135395 (2020)","DOI":"10.1109\/CVPR42600.2020.00543"},{"key":"15_CR5","unstructured":"Finn, C., Abbeel, P., Levine, S.: Model-agnostic meta-learning for fast adaptation of deep networks. In: International Conference on Machine Learning, pp. 1126\u20131135. PMLR (2017)"},{"key":"15_CR6","doi-asserted-by":"publisher","first-page":"1120","DOI":"10.1109\/TIP.2021.3131033","volume":"31","author":"Y Ge","year":"2021","unstructured":"Ge, Y., Zhang, R., Luo, P.: MetaCloth: learning unseen tasks of dense fashion landmark detection from a few samples. IEEE Trans. Image Process. 31, 1120\u20131133 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"15_CR7","doi-asserted-by":"crossref","unstructured":"Ge, Y., Zhang, R., Wang, X., Tang, X., Luo, P.: DeepFashion2: a versatile benchmark for detection, pose estimation, segmentation and re-identification of clothing images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
5337\u20135345 (2019)","DOI":"10.1109\/CVPR.2019.00548"},{"key":"15_CR8","doi-asserted-by":"crossref","unstructured":"Geng, Z., Wang, C., Wei, Y., Liu, Z., Li, H., Hu, H.: Human pose as compositional tokens. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 660\u2013671 (2023)","DOI":"10.1109\/CVPR52729.2023.00071"},{"key":"15_CR9","doi-asserted-by":"publisher","DOI":"10.7554\/eLife.47994","volume":"8","author":"JM Graving","year":"2019","unstructured":"Graving, J.M., et al.: Deepposekit, a software toolkit for fast and robust animal pose estimation using deep learning. Elife 8, e47994 (2019)","journal-title":"Elife"},{"key":"15_CR10","unstructured":"Gu, X., Lin, T.Y., Kuo, W., Cui, Y.: Open-vocabulary object detection via vision and language knowledge distillation. arXiv preprint arXiv:2104.13921 (2021)"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16000\u201316009 (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"15_CR12","doi-asserted-by":"crossref","unstructured":"He, X., Bharaj, G., Ferman, D., Rhodin, H., Garrido, P.: Few-shot geometry-aware keypoint localization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 21337\u201321348 (2023)","DOI":"10.1109\/CVPR52729.2023.02044"},{"key":"15_CR13","unstructured":"Jiang, Q., et al.: T-rex: counting by visual prompting. arXiv preprint arXiv:2311.13596 (2023)"},{"key":"15_CR14","doi-asserted-by":"crossref","unstructured":"Jiang, Q., Li, F., Zeng, Z., Ren, T., Liu, S., Zhang, L.: T-rex2: towards generic object detection via text-visual prompt synergy. 
arXiv preprint arXiv:2403.14610 (2024)","DOI":"10.1007\/978-3-031-73414-4_3"},{"key":"15_CR15","unstructured":"Jiang, T., et al.: RTMPose: real-time multi-person pose estimation based on mmpose. arXiv preprint arXiv:2303.07399 (2023)"},{"key":"15_CR16","doi-asserted-by":"crossref","unstructured":"Ju, X., Zeng, A., Wang, J., Xu, Q., Zhang, L.: Human-art: a versatile human-centric dataset bridging natural and artificial scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 618\u2013629 (2023)","DOI":"10.1109\/CVPR52729.2023.00067"},{"key":"15_CR17","doi-asserted-by":"crossref","unstructured":"Khan, M.H., et al.: AnimalWeb: a large-scale hierarchical dataset of annotated animal faces. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6939\u20136948 (2020)","DOI":"10.1109\/CVPR42600.2020.00697"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Kirillov, A., et\u00a0al.: Segment anything. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4015\u20134026 (2023)","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"15_CR19","doi-asserted-by":"publisher","DOI":"10.3389\/fnbeh.2020.581154","volume":"14","author":"R Labuguen","year":"2021","unstructured":"Labuguen, R., et al.: Macaquepose: a novel \u201cin the wild\u201d macaque monkey pose dataset for markerless motion capture. Front. Behav. Neurosci. 14, 581154 (2021)","journal-title":"Front. Behav. Neurosci."},{"issue":"4","key":"15_CR20","doi-asserted-by":"publisher","first-page":"496","DOI":"10.1038\/s41592-022-01443-0","volume":"19","author":"J Lauer","year":"2022","unstructured":"Lauer, J., et al.: Multi-animal pose estimation, identification and tracking with deeplabcut. Nat. Methods 19(4), 496\u2013504 (2022)","journal-title":"Nat. Methods"},{"key":"15_CR21","unstructured":"Li, F., et al.: Semantic-SAM: segment and recognize anything at any granularity. 
arXiv preprint arXiv:2307.04767 (2023)"},{"key":"15_CR22","doi-asserted-by":"crossref","unstructured":"Li, H., et al.: TAPTR: tracking any point with transformers as detection. arXiv preprint arXiv:2403.13042 (2024)","DOI":"10.1007\/978-3-031-72640-8_4"},{"key":"15_CR23","doi-asserted-by":"crossref","unstructured":"Li, K., Wang, S., Zhang, X., Xu, Y., Xu, W., Tu, Z.: Pose recognition with cascade transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1944\u20131953 (2021)","DOI":"10.1109\/CVPR46437.2021.00198"},{"key":"15_CR24","doi-asserted-by":"crossref","unstructured":"Li, L.H., et\u00a0al.: Grounded language-image pre-training. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10965\u201310975 (2022)","DOI":"10.1109\/CVPR52688.2022.01069"},{"key":"15_CR25","doi-asserted-by":"crossref","unstructured":"Liang, F., et al.: Open-vocabulary semantic segmentation with mask-adapted clip. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7061\u20137070 (2023)","DOI":"10.1109\/CVPR52729.2023.00682"},{"key":"15_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Liu, H., et\u00a0al.: Group pose: a simple baseline for end-to-end multi-person pose estimation. 
arXiv preprint arXiv:2308.07313 (2023)","DOI":"10.1109\/ICCV51070.2023.01380"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Liu, S., et\u00a0al.: Grounding DINO: marrying DINO with grounded pre-training for open-set object detection. arXiv preprint arXiv:2303.05499 (2023)","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Lu, C., Koniusz, P.: Few-shot keypoint detection with uncertainty learning for unseen species. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19416\u201319426 (2022)","DOI":"10.1109\/CVPR52688.2022.01881"},{"key":"15_CR30","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"72","DOI":"10.1007\/978-3-031-20068-7_5","volume-title":"ECCV 2022","author":"W Mao","year":"2022","unstructured":"Mao, W., et al.: Poseur: direct human pose regression with transformers. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13666, pp. 72\u201388. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_5"},{"key":"15_CR31","unstructured":"Mathis, A., et al.: DeepLabCut: markerless pose estimation of user-defined body parts with deep learning. Nat. Neurosci. (2018). https:\/\/www.nature.com\/articles\/s41593-018-0209-y"},{"issue":"1","key":"15_CR32","doi-asserted-by":"publisher","first-page":"99","DOI":"10.1145\/3503250","volume":"65","author":"B Mildenhall","year":"2021","unstructured":"Mildenhall, B., Srinivasan, P.P., Tancik, M., Barron, J.T., Ramamoorthi, R., Ng, R.: NeRF: representing scenes as neural radiance fields for view synthesis. Commun. ACM 65(1), 99\u2013106 (2021)","journal-title":"Commun. ACM"},{"key":"15_CR33","unstructured":"Nakamura, A., Harada, T.: Revisiting fine-tuning for few-shot learning. 
arXiv preprint arXiv:1910.00216 (2019)"},{"key":"15_CR34","doi-asserted-by":"crossref","unstructured":"Ng, X.L., Ong, K.E., Zheng, Q., Ni, Y., Yeo, S.Y., Liu, J.: Animal kingdom: a large and diverse dataset for animal behavior understanding. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19023\u201319034 (2022)","DOI":"10.1109\/CVPR52688.2022.01844"},{"issue":"1","key":"15_CR35","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1038\/s41592-018-0234-5","volume":"16","author":"TD Pereira","year":"2019","unstructured":"Pereira, T.D., et al.: Fast animal pose estimation using deep neural networks. Nat. Methods 16(1), 117\u2013125 (2019)","journal-title":"Nat. Methods"},{"key":"15_CR36","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"15_CR37","doi-asserted-by":"crossref","unstructured":"Reddy, N.D., Vo, M., Narasimhan, S.G.: CarFusion: combining point tracking and part detection for dynamic 3D reconstruction of vehicles. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1906\u20131915 (2018)","DOI":"10.1109\/CVPR.2018.00204"},{"key":"15_CR38","unstructured":"Ren, T., et\u00a0al.: Grounded SAM: assembling open-world models for diverse visual tasks. arXiv preprint arXiv:2401.14159 (2024)"},{"key":"15_CR39","doi-asserted-by":"crossref","unstructured":"Rezatofighi, H., Tsoi, N., Gwak, J., Sadeghian, A., Reid, I., Savarese, S.: Generalized intersection over union: a metric and a loss for bounding box regression. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 
658\u2013666 (2019)","DOI":"10.1109\/CVPR.2019.00075"},{"key":"15_CR40","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.imavis.2016.01.002","volume":"47","author":"C Sagonas","year":"2016","unstructured":"Sagonas, C., Antonakos, E., Tzimiropoulos, G., Zafeiriou, S., Pantic, M.: 300 faces in-the-wild challenge: database and results. Image Vis. Comput. 47, 3\u201318 (2016)","journal-title":"Image Vis. Comput."},{"key":"15_CR41","doi-asserted-by":"crossref","unstructured":"Shi, D., Wei, X., Li, L., Ren, Y., Tan, W.: End-to-end multi-person pose estimation with transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11069\u201311078 (2022)","DOI":"10.1109\/CVPR52688.2022.01079"},{"key":"15_CR42","doi-asserted-by":"crossref","unstructured":"Shi, M., Huang, Z., Ma, X., Hu, X., Cao, Z.: Matching is not enough: a two-stage framework for category-agnostic pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7308\u20137317 (2023)","DOI":"10.1109\/CVPR52729.2023.00706"},{"key":"15_CR43","unstructured":"Snell, J., Swersky, K., Zemel, R.: Prototypical networks for few-shot learning. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"15_CR44","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., Wang, J.: Deep high-resolution representation learning for human pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5693\u20135703 (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"15_CR45","unstructured":"Sun, M., et al.: UniAP: towards universal animal perception in vision via few-shot learning. arXiv preprint arXiv:2308.09953 (2023)"},{"key":"15_CR46","doi-asserted-by":"crossref","unstructured":"Sun, P., et al.: Going denser with open-vocabulary part segmentation. 
arXiv preprint arXiv:2305.11173 (2023)","DOI":"10.1109\/ICCV51070.2023.01417"},{"key":"15_CR47","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"15_CR48","unstructured":"Wang, W., et\u00a0al.: VisionLLM: large language model is also an open-ended decoder for vision-centric tasks. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"issue":"11","key":"15_CR49","doi-asserted-by":"publisher","first-page":"3258","DOI":"10.1109\/TCSVT.2018.2879980","volume":"29","author":"Y Wang","year":"2018","unstructured":"Wang, Y., Peng, C., Liu, Y.: Mask-pose cascaded CNN for 2D hand pose estimation from single color image. IEEE Trans. Circuits Syst. Video Technol. 29(11), 3258\u20133268 (2018)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"15_CR50","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"365","DOI":"10.1007\/978-3-319-46466-4_22","volume-title":"Computer Vision \u2013 ECCV 2016","author":"J Wu","year":"2016","unstructured":"Wu, J., et al.: Single image 3D interpreter network. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 365\u2013382. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_22"},{"key":"15_CR51","doi-asserted-by":"crossref","unstructured":"Xiao, B., Wu, H., Wei, Y.: Simple baselines for human pose estimation and tracking. In: Proceedings of the European Conference on Computer Vision (ECCV), pp. 466\u2013481 (2018)","DOI":"10.1007\/978-3-030-01231-1_29"},{"key":"15_CR52","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1007\/978-3-031-20068-7_23","volume-title":"ECCV 2022","author":"L Xu","year":"2022","unstructured":"Xu, L., et al.: Pose for everything: towards category-agnostic pose estimation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. 
LNCS, vol. 13666, pp. 398\u2013416. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_23"},{"key":"15_CR53","unstructured":"Xu, Y., Zhang, J., Zhang, Q., Tao, D.: ViTPose: simple vision transformer baselines for human pose estimation. In: Advances in Neural Information Processing Systems, vol. 35, pp. 38571\u201338584 (2022)"},{"key":"15_CR54","unstructured":"Xu, Y., Zhang, J., Zhang, Q., Tao, D.: ViTPose+: vision transformer foundation model for generic body pose estimation. arXiv preprint arXiv:2212.04246 (2022)"},{"key":"15_CR55","unstructured":"Yang, J., Li, B., Yang, F., Zeng, A., Zhang, L., Zhang, R.: Boosting human-object interaction detection with text-to-image diffusion model. arXiv preprint arXiv:2305.12252 (2023)"},{"key":"15_CR56","doi-asserted-by":"crossref","unstructured":"Yang, J., Li, B., Zeng, A., Zhang, L., Zhang, R.: Open-world human-object interaction detection via multi-modal prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16954\u201316964 (2024)","DOI":"10.1109\/CVPR52733.2024.01604"},{"key":"15_CR57","doi-asserted-by":"crossref","unstructured":"Yang, J., Wang, C., Li, Z., Wang, J., Zhang, R.: Semantic human parsing via scalable semantic transfer over multiple label domains. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19424\u201319433 (2023)","DOI":"10.1109\/CVPR52729.2023.01861"},{"key":"15_CR58","doi-asserted-by":"crossref","unstructured":"Yang, J., Zeng, A., Li, F., Liu, S., Zhang, R., Zhang, L.: Neural interactive keypoint detection. arXiv preprint arXiv:2308.10174 (2023)","DOI":"10.1109\/ICCV51070.2023.01388"},{"key":"15_CR59","unstructured":"Yang, J., Zeng, A., Liu, S., Li, F., Zhang, R., Zhang, L.: Explicit box detection unifies end-to-end multi-person pose estimation. 
In: The Eleventh International Conference on Learning Representations (2022)"},{"key":"15_CR60","unstructured":"Yang, J., Zhu, Y., Wang, C., Li, Z., Zhang, R.: Toward unpaired multi-modal medical image segmentation via learning structured semantic consistency. arXiv preprint arXiv:2206.10571 (2022)"},{"key":"15_CR61","unstructured":"Yang, Y., Yang, J., Xu, Y., Zhang, J., Lan, L., Tao, D.: APT-36K: a large-scale benchmark for animal pose estimation and tracking. In: Advances in Neural Information Processing Systems, vol. 35, pp. 17301\u201317313 (2022)"},{"key":"15_CR62","unstructured":"Yao, L., et al.: DetCLIP: dictionary-enriched visual-concept paralleled pre-training for open-world detection. In: Advances in Neural Information Processing Systems, vol. 35, pp. 9125\u20139138 (2022)"},{"key":"15_CR63","unstructured":"Ye, S., et al.: Superanimal models pretrained for plug-and-play analysis of animal behavior. arXiv preprint arXiv:2203.07436 (2022)"},{"key":"15_CR64","unstructured":"Yu, H., Xu, Y., Zhang, J., Zhao, W., Guan, Z., Tao, D.: AP-10K: a benchmark for animal pose estimation in the wild. arXiv preprint arXiv:2108.12617 (2021)"},{"key":"15_CR65","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1007\/978-3-031-20077-9_7","volume-title":"ECCV 2022","author":"Y Zang","year":"2022","unstructured":"Zang, Y., Li, W., Zhou, K., Huang, C., Loy, C.C.: Open-vocabulary DETR with conditional matching. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13669, pp. 106\u2013122. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20077-9_7"},{"key":"15_CR66","unstructured":"Zhang, H., et al.: DINO: DETR with improved denoising anchor boxes for end-to-end object detection. 
arXiv preprint arXiv:2203.03605 (2022)"},{"key":"15_CR67","doi-asserted-by":"crossref","unstructured":"Zhang, X., Wang, W., Chen, Z., Xu, Y., Zhang, J., Tao, D.: Clamp: prompt-based contrastive learning for connecting language and animal pose. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 23272\u201323281 (2023)","DOI":"10.1109\/CVPR52729.2023.02229"},{"key":"15_CR68","doi-asserted-by":"crossref","unstructured":"Zhong, Y., et\u00a0al.: RegionCLIP: region-based language-image pretraining. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 16793\u201316803 (2022)","DOI":"10.1109\/CVPR52688.2022.01629"},{"key":"15_CR69","doi-asserted-by":"crossref","unstructured":"Zhou, M., Stoffl, L., Mathis, M.W., Mathis, A.: Rethinking pose estimation in crowds: overcoming the detection information bottleneck and ambiguity. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV), pp. 14689\u201314699 (2023)","DOI":"10.1109\/ICCV51070.2023.01350"},{"key":"15_CR70","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"},{"key":"15_CR71","unstructured":"Zou, X., et al.: Segment everything everywhere all at once. In: Advances in Neural Information Processing Systems, vol. 
36 (2024)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72952-2_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:41:04Z","timestamp":1732830064000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72952-2_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,1]]},"ISBN":["9783031729515","9783031729522"],"references-count":71,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72952-2_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,1]]},"assertion":[{"value":"1 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}