{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T18:00:07Z","timestamp":1775325607036,"version":"3.50.1"},"publisher-location":"Cham","reference-count":66,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726231","type":"print"},{"value":"9783031726248","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72624-8_20","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T09:52:13Z","timestamp":1729849933000},"page":"342-360","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":10,"title":["Harnessing Text-to-Image Diffusion Models for\u00a0Category-Agnostic Pose Estimation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3281-0772","authenticated-orcid":false,"given":"Duo","family":"Peng","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0000-8956-0095","authenticated-orcid":false,"given":"Zhengbo","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3121-9852","authenticated-orcid":false,"given":"Ping","family":"Hu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9998-3614","authenticated-orcid":false,"given":"Qiuhong","family":"Ke","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9061-7423","authenticated-orcid":false,"given":"David K. Y.","family":"Yau","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4365-4165","authenticated-orcid":false,"given":"Jun","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"20_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"717","DOI":"10.1007\/978-3-319-46478-7_44","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Bulat","year":"2016","unstructured":"Bulat, A., Tzimiropoulos, G.: Human pose estimation via convolutional part heatmap regression. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 717\u2013732. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_44"},{"key":"20_CR2","doi-asserted-by":"crossref","unstructured":"Bulat, A., Tzimiropoulos, G.: How far are we from solving the 2D & 3D face alignment problem?(and a dataset of 230,000 3D facial landmarks). In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1021\u20131030 (2017)","DOI":"10.1109\/ICCV.2017.116"},{"key":"20_CR3","doi-asserted-by":"crossref","unstructured":"Cao, Z., Simon, T., Wei, S.E., Sheikh, Y.: Realtime multi-person 2D pose estimation using part affinity fields. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7291\u20137299 (2017)","DOI":"10.1109\/CVPR.2017.143"},{"key":"20_CR4","doi-asserted-by":"crossref","unstructured":"Chan, C., Ginosar, S., Zhou, T., Efros, A.A.: Everybody dance now. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 5933\u20135942 (2019)","DOI":"10.1109\/ICCV.2019.00603"},{"key":"20_CR5","unstructured":"Finn, C., Abbeel, P., Levine, S.: Model-agnostic meta-learning for fast adaptation of deep networks. In: International Conference on Machine Learning, pp. 1126\u20131135. PMLR (2017)"},{"issue":"14\u201315","key":"20_CR6","doi-asserted-by":"publisher","first-page":"2627","DOI":"10.1016\/S1352-2310(97)00447-0","volume":"32","author":"MW Gardner","year":"1998","unstructured":"Gardner, M.W., Dorling, S.: Artificial neural networks (the multilayer perceptron)-a review of applications in the atmospheric sciences. Atmos. Environ. 32(14\u201315), 2627\u20132636 (1998)","journal-title":"Atmos. Environ."},{"key":"20_CR7","doi-asserted-by":"crossref","unstructured":"Ge, L., Cai, Y., Weng, J., Yuan, J.: Hand PointNet: 3D hand pose estimation using point sets. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 8417\u20138426 (2018)","DOI":"10.1109\/CVPR.2018.00878"},{"key":"20_CR8","doi-asserted-by":"crossref","unstructured":"Ge, L., et al.: 3D hand shape and pose estimation from a single RGB image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10833\u201310842 (2019)","DOI":"10.1109\/CVPR.2019.01109"},{"key":"20_CR9","doi-asserted-by":"crossref","unstructured":"Ge, Y., Zhang, R., Wang, X., Tang, X., Luo, P.: DeepFashion2: a versatile benchmark for detection, pose estimation, segmentation and re-identification of clothing images. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 5337\u20135345 (2019)","DOI":"10.1109\/CVPR.2019.00548"},{"key":"20_CR10","doi-asserted-by":"crossref","unstructured":"Gilroy, S., Glavin, M., Jones, E., Mullins, D.: Pedestrian occlusion level classification using keypoint detection and 2d body surface area estimation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3833\u20133839 (2021)","DOI":"10.1109\/ICCVW54120.2021.00427"},{"key":"20_CR11","doi-asserted-by":"crossref","unstructured":"Gong, J., Fan, Z., Ke, Q., Rahmani, H., Liu, J.: Meta agent teaming active learning for pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11079\u201311089 (2022)","DOI":"10.1109\/CVPR52688.2022.01080"},{"key":"20_CR12","doi-asserted-by":"publisher","first-page":"e47994","DOI":"10.7554\/eLife.47994","volume":"8","author":"JM Graving","year":"2019","unstructured":"Graving, J.M., et al.: DeepPoseKit, a software toolkit for fast and robust animal pose estimation using deep learning. Elife 8, e47994 (2019)","journal-title":"Elife"},{"key":"20_CR13","doi-asserted-by":"crossref","unstructured":"Guleryuz, O.G., Kaeser-Chen, C.: Fast lifting for 3D hand pose estimation in AR\/VR applications. In: 2018 25th IEEE International Conference on Image Processing (ICIP), pp. 106\u2013110 (2018)","DOI":"10.1109\/ICIP.2018.8451559"},{"key":"20_CR14","doi-asserted-by":"crossref","unstructured":"Huang, M.H., Foo, L.G., Liu, J.: Learning to unlearn for robust machine unlearning. In: European Conference on Computer Vision. Springer, Heidelberg (2024)","DOI":"10.1007\/978-3-031-72943-0_12"},{"key":"20_CR15","unstructured":"Huang, S., Qi, S., Xiao, Y., Zhu, Y., Wu, Y.N., Zhu, S.C.: Cooperative holistic scene understanding: unifying 3D object, layout, and camera pose estimation. In: Advances in Neural Information Processing Systems, vol. 31 (2018)"},{"key":"20_CR16","unstructured":"Hui, X., Wu, Q., Rahmani, H., Liu, J.: Class-agnostic object counting with text-to-image diffusion model. In: European Conference on Computer Vision. Springer, Heidelberg (2024)"},{"issue":"21","key":"20_CR17","doi-asserted-by":"publisher","first-page":"3551","DOI":"10.3390\/electronics11213551","volume":"11","author":"S Iftikhar","year":"2022","unstructured":"Iftikhar, S., Zhang, Z., Asim, M., Muthanna, A., Koucheryavy, A., Abd El-Latif, A.A.: Deep learning-based pedestrian detection in autonomous vehicles: substantial issues and challenges. Electronics 11(21), 3551 (2022)","journal-title":"Electronics"},{"key":"20_CR18","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6007\u20136017 (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"20_CR19","doi-asserted-by":"crossref","unstructured":"Khan, M.H., et al.: AnimalWeb: a large-scale hierarchical dataset of annotated animal faces. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6939\u20136948 (2020)","DOI":"10.1109\/CVPR42600.2020.00697"},{"key":"20_CR20","unstructured":"Khani, A., Taghanaki, S.A., Sanghi, A., Amiri, A.M., Hamarneh, G.: SLiME: segment like me. arXiv preprint arXiv:2309.03179 (2023)"},{"key":"20_CR21","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"20_CR22","doi-asserted-by":"crossref","unstructured":"Koestinger, M., Wohlhart, P., Roth, P.M., Bischof, H.: Annotated facial landmarks in the wild: A large-scale, real-world database for facial landmark localization. In: 2011 IEEE International Conference on Computer Vision Workshops (ICCV Workshops), pp. 2144\u20132151. IEEE (2011)","DOI":"10.1109\/ICCVW.2011.6130513"},{"key":"20_CR23","doi-asserted-by":"crossref","unstructured":"Krau\u00df, V., Boden, A., Oppermann, L., Reiners, R.: Current practices, challenges, and design implications for collaborative AR\/VR application development. In: Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems, pp. 1\u201315 (2021)","DOI":"10.1145\/3411764.3445335"},{"key":"20_CR24","doi-asserted-by":"publisher","first-page":"581154","DOI":"10.3389\/fnbeh.2020.581154","volume":"14","author":"R Labuguen","year":"2021","unstructured":"Labuguen, R., et al.: MacaquePose: a novel \u201cin the wild\u2019\u2019 macaque monkey pose dataset for markerless motion capture. Front. Behav. Neurosci. 14, 581154 (2021)","journal-title":"Front. Behav. Neurosci."},{"key":"20_CR25","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"20_CR26","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: zero-shot one image to 3d object. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9298\u20139309 (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"20_CR27","doi-asserted-by":"crossref","unstructured":"Liu, Z., Chen, Z., Bai, J., Li, S., Lian, S.: Facial pose estimation by deep learning from label distributions. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision Workshops (2019)","DOI":"10.1109\/ICCVW.2019.00156"},{"key":"20_CR28","unstructured":"Nakamura, A., Harada, T.: Revisiting fine-tuning for few-shot learning. arXiv preprint arXiv:1910.00216 (2019)"},{"key":"20_CR29","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"483","DOI":"10.1007\/978-3-319-46484-8_29","volume-title":"Computer Vision \u2013 ECCV 2016","author":"A Newell","year":"2016","unstructured":"Newell, A., Yang, K., Deng, J.: Stacked hourglass networks for human pose estimation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9912, pp. 483\u2013499. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46484-8_29"},{"key":"20_CR30","doi-asserted-by":"crossref","unstructured":"Nguyen, L.X., Aung, P.S., Le, H.Q., Park, S.B., Hong, C.S.: A new chapter for medical image generation: the stable diffusion method. In: 2023 International Conference on Information Networking (ICOIN), pp. 483\u2013486. IEEE (2023)","DOI":"10.1109\/ICOIN56518.2023.10049010"},{"key":"20_CR31","unstructured":"Nichol, A.Q., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. In: Proceedings of the 39th International Conference on Machine Learning, vol.\u00a0162, pp. 16784\u201316804. PMLR (2022)"},{"key":"20_CR32","doi-asserted-by":"crossref","unstructured":"Peng, H., et\u00a0al.: The multi-modal video reasoning and analyzing competition. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 806\u2013813 (2021)","DOI":"10.1109\/ICCVW54120.2021.00095"},{"issue":"1","key":"20_CR33","doi-asserted-by":"publisher","first-page":"117","DOI":"10.1038\/s41592-018-0234-5","volume":"16","author":"TD Pereira","year":"2019","unstructured":"Pereira, T.D., et al.: Fast animal pose estimation using deep neural networks. Nat. Methods 16(1), 117\u2013125 (2019)","journal-title":"Nat. Methods"},{"key":"20_CR34","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"285","DOI":"10.1007\/978-3-319-48881-3_20","volume-title":"Computer Vision \u2013 ECCV 2016 Workshops","author":"T Probst","year":"2016","unstructured":"Probst, T., Fossati, A., Van Gool, L.: Combining human body shape and pose estimation for robust upper body tracking using a depth sensor. In: Hua, G., J\u00e9gou, H. (eds.) ECCV 2016. LNCS, vol. 9914, pp. 285\u2013301. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-48881-3_20"},{"key":"20_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"33","DOI":"10.1007\/978-3-319-10605-2_3","volume-title":"Computer Vision \u2013 ECCV 2014","author":"V Ramakrishna","year":"2014","unstructured":"Ramakrishna, V., Munoz, D., Hebert, M., Andrew Bagnell, J., Sheikh, Y.: Pose machines: articulated pose estimation via inference machines. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8690, pp. 33\u201347. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10605-2_3"},{"key":"20_CR36","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"20_CR37","doi-asserted-by":"crossref","unstructured":"Reddy, N.D., Vo, M., Narasimhan, S.G.: CarFusion: combining point tracking and part detection for dynamic 3D reconstruction of vehicles. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1906\u20131915 (2018)","DOI":"10.1109\/CVPR.2018.00204"},{"key":"20_CR38","doi-asserted-by":"crossref","unstructured":"Reddy, N.D., Vo, M., Narasimhan, S.G.: Occlusion-net: 2D\/3D occluded keypoint localization using graph networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7326\u20137335 (2019)","DOI":"10.1109\/CVPR.2019.00750"},{"key":"20_CR39","doi-asserted-by":"crossref","unstructured":"Reed, S., Akata, Z., Lee, H., Schiele, B.: Learning deep representations of fine-grained visual descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.13"},{"key":"20_CR40","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"20_CR41","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015, Part III. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"20_CR42","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.imavis.2016.01.002","volume":"47","author":"C Sagonas","year":"2016","unstructured":"Sagonas, C., Antonakos, E., Tzimiropoulos, G., Zafeiriou, S., Pantic, M.: 300 faces in-the-wild challenge: database and results. Image Vis. Comput. 47, 3\u201318 (2016)","journal-title":"Image Vis. Comput."},{"key":"20_CR43","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems, vol. 35, pp. 36479\u201336494 (2022)"},{"key":"20_CR44","doi-asserted-by":"publisher","first-page":"132539","DOI":"10.1109\/ACCESS.2020.3010307","volume":"8","author":"HC S\u00e1nchez","year":"2020","unstructured":"S\u00e1nchez, H.C., Mart\u00ednez, A.H., Gonzalo, R.I., Parra, N.H., Alonso, I.P., Fernandez-Llorca, D.: Simple baseline for vehicle pose estimation: Experimental validation. IEEE Access 8, 132539\u2013132550 (2020)","journal-title":"IEEE Access"},{"key":"20_CR45","doi-asserted-by":"crossref","unstructured":"Shi, M., Huang, Z., Ma, X., Hu, X., Cao, Z.: Matching is not enough: a two-stage framework for category-agnostic pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7308\u20137317 (2023)","DOI":"10.1109\/CVPR52729.2023.00706"},{"key":"20_CR46","unstructured":"Snell, J., Swersky, K., Zemel, R.: Prototypical networks for few-shot learning. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"20_CR47","doi-asserted-by":"crossref","unstructured":"Sun, K., Xiao, B., Liu, D., Wang, J.: Deep high-resolution representation learning for human pose estimation. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 5693\u20135703 (2019)","DOI":"10.1109\/CVPR.2019.00584"},{"key":"20_CR48","doi-asserted-by":"crossref","unstructured":"Sun, Y., Wang, X., Tang, X.: Deep convolutional network cascade for facial point detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3476\u20133483 (2013)","DOI":"10.1109\/CVPR.2013.446"},{"key":"20_CR49","doi-asserted-by":"crossref","unstructured":"Tang, W., Wu, Y.: Does learning specific features for related parts help human pose estimation? In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1107\u20131116 (2019)","DOI":"10.1109\/CVPR.2019.00120"},{"key":"20_CR50","doi-asserted-by":"crossref","unstructured":"Toshev, A., Szegedy, C.: DeepPose: human pose estimation via deep neural networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1653\u20131660 (2014)","DOI":"10.1109\/CVPR.2014.214"},{"key":"20_CR51","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"492","DOI":"10.1007\/978-3-030-58621-8_29","volume-title":"Computer Vision \u2013 ECCV 2020","author":"J Wang","year":"2020","unstructured":"Wang, J., Long, X., Gao, Y., Ding, E., Wen, S.: Graph-PCNN: two stage human pose estimation with graph pose refinement. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12356, pp. 492\u2013508. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58621-8_29"},{"issue":"11","key":"20_CR52","doi-asserted-by":"publisher","first-page":"3258","DOI":"10.1109\/TCSVT.2018.2879980","volume":"29","author":"Y Wang","year":"2018","unstructured":"Wang, Y., Peng, C., Liu, Y.: Mask-pose cascaded CNN for 2D hand pose estimation from single color image. IEEE Trans. Circuits Syst. Video Technol. 29(11), 3258\u20133268 (2018)","journal-title":"IEEE Trans. Circuits Syst. Video Technol."},{"key":"20_CR53","unstructured":"Welinder, P., et al.: Caltech-UCSD birds 200 (2010)"},{"key":"20_CR54","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"365","DOI":"10.1007\/978-3-319-46466-4_22","volume-title":"Computer Vision \u2013 ECCV 2016","author":"J Wu","year":"2016","unstructured":"Wu, J., et al.: Single image 3D interpreter network. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 365\u2013382. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_22"},{"key":"20_CR55","doi-asserted-by":"crossref","unstructured":"Xie, S., Zhang, Z., Lin, Z., Hinz, T., Zhang, K.: SmartBrush: text and shape guided object inpainting with diffusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22428\u201322437 (2023)","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"20_CR56","doi-asserted-by":"crossref","unstructured":"Xu, L., Huang, M.H., Shang, X., Yuan, Z., Sun, Y., Liu, J.: Meta compositional referring expression segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19478\u201319487 (2023)","DOI":"10.1109\/CVPR52729.2023.01866"},{"key":"20_CR57","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1007\/978-3-031-20068-7_23","volume-title":"Computer Vision \u2013 ECCV 2022","author":"L Xu","year":"2022","unstructured":"Xu, L., et al.: Pose for everything: towards category-agnostic pose estimation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13666, pp. 398\u2013416. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-20068-7_23"},{"key":"20_CR58","doi-asserted-by":"crossref","unstructured":"Xu, W., Su, P.c., Sen-ching, S.C.: Human pose estimation using two RGB-D sensors. In: 2016 IEEE International Conference on Image Processing (ICIP), pp. 1279\u20131283. IEEE (2016)","DOI":"10.1109\/ICIP.2016.7532564"},{"issue":"12","key":"20_CR59","doi-asserted-by":"publisher","first-page":"2878","DOI":"10.1109\/TPAMI.2012.261","volume":"35","author":"Y Yang","year":"2012","unstructured":"Yang, Y., Ramanan, D.: Articulated human detection with flexible mixtures of parts. IEEE Trans. Pattern Anal. Mach. Intell. 35(12), 2878\u20132890 (2012)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"20_CR60","doi-asserted-by":"crossref","unstructured":"Zhang, D., Guo, G., Huang, D., Han, J.: PoseFlow: a deep motion representation for understanding human behaviors in videos. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6762\u20136770 (2018)","DOI":"10.1109\/CVPR.2018.00707"},{"key":"20_CR61","unstructured":"Zhang, J., Cai, Y., Yan, S., Feng, J., et al.: Direct multi-view multi-person 3D pose estimation. In: Advances in Neural Information Processing Systems, vol. 34, pp. 13153\u201313164 (2021)"},{"key":"20_CR62","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Xu, L., Peng, D., Rahmani, H., Liu, J.: Diff-tracker: text-to-image diffusion models are unsupervised trackers. In: European Conference on Computer Vision. Springer, Heidelberg (2024)","DOI":"10.1007\/978-3-031-73390-1_19"},{"key":"20_CR63","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Zhou, C., Tu, Z.: Distilling inter-class distance for semantic segmentation. arXiv preprint arXiv:2205.03650 (2022)","DOI":"10.24963\/ijcai.2022\/235"},{"issue":"4","key":"20_CR64","doi-asserted-by":"publisher","first-page":"787","DOI":"10.1109\/TIV.2021.3078619","volume":"6","author":"C Zhao","year":"2021","unstructured":"Zhao, C., Fu, C., Dolan, J.M., Wang, J.: L-shape fitting-based vehicle pose estimation and tracking using 3D-LiDAR. IEEE Trans. Intell. Veh. 6(4), 787\u2013798 (2021)","journal-title":"IEEE Trans. Intell. Veh."},{"key":"20_CR65","doi-asserted-by":"crossref","unstructured":"Zheng, C., Zhu, S., Mendieta, M., Yang, T., Chen, C., Ding, Z.: 3D human pose estimation with spatial and temporal transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 11656\u201311665 (2021)","DOI":"10.1109\/ICCV48922.2021.01145"},{"key":"20_CR66","doi-asserted-by":"crossref","unstructured":"Zimmermann, C., Brox, T.: Learning to estimate 3D hand pose from single RGB images. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4903\u20134911 (2017)","DOI":"10.1109\/ICCV.2017.525"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72624-8_20","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T07:46:59Z","timestamp":1732952819000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72624-8_20"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031726231","9783031726248"],"references-count":66,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72624-8_20","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}