{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T16:14:04Z","timestamp":1759940044004,"version":"3.40.3"},"publisher-location":"Cham","reference-count":47,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031734106"},{"type":"electronic","value":"9783031734113"}],"license":[{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,23]],"date-time":"2024-11-23T00:00:00Z","timestamp":1732320000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73411-3_12","type":"book-chapter","created":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T20:07:11Z","timestamp":1732306031000},"page":"201-217","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["HumanRefiner: Benchmarking Abnormal Human Generation and\u00a0Refining with\u00a0Coarse-to-Fine Pose-Reversible Guidance"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0004-7344-2333","authenticated-orcid":false,"given":"Guian","family":"Fang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9884-9923","authenticated-orcid":false,"given":"Wenbiao","family":"Yan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5835-8545","authenticated-orcid":false,"given":"Yuanfan","family":"Guo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1559-657X","authenticated-orcid":false,"given":"Jianhua","family":"Han","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6787-5499","authenticated-orcid":false,"given":"Zutao","family":"Jiang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3645-8972","authenticated-orcid":false,"given":"Hang","family":"Xu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8941-2295","authenticated-orcid":false,"given":"Shengcai","family":"Liao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0989-7446","authenticated-orcid":false,"given":"Xiaodan","family":"Liang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,23]]},"reference":[{"key":"12_CR1","doi-asserted-by":"crossref","unstructured":"Andriluka, M., et al.: PoseTrack: a benchmark for human pose estimation and tracking. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 5167\u20135176 (2018)","DOI":"10.1109\/CVPR.2018.00542"},{"key":"12_CR2","unstructured":"Balaji, Y., et\u00a0al.: eDiffI: text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324 (2022)"},{"key":"12_CR3","unstructured":"Brown, T., et\u00a0al.: Language models are few-shot learners. In: Advances in Neural Information Processing Systems, pp. 1877\u20131901 (2020)"},{"key":"12_CR4","unstructured":"Byeon, M., Park, B., Kim, H., Lee, S., Baek, W., Kim, S.: COYO-700M: image-text pair dataset. https:\/\/github.com\/kakaobrain\/coyo-dataset (2022)"},{"key":"12_CR5","doi-asserted-by":"crossref","unstructured":"Cao, Z., Hidalgo Martinez, G., Simon, T., Wei, S., Sheikh, Y.A.: OpenPose: realtime multi-person 2D pose estimation using part affinity fields. IEEE Trans. Pattern Anal. Mach. Intell. 43, 172\u2013186 (2019)","DOI":"10.1109\/TPAMI.2019.2929257"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Cao, Z., Simon, T., Wei, S.E., Sheikh, Y.: Realtime multi-person 2D pose estimation using part affinity fields. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 7291\u20137299 (2017)","DOI":"10.1109\/CVPR.2017.143"},{"key":"12_CR7","unstructured":"Chang, H., et\u00a0al.: Muse: text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704 (2023)"},{"key":"12_CR8","doi-asserted-by":"crossref","unstructured":"Charles, J., Pfister, T., Magee, D., Hogg, D., Zisserman, A.: Personalizing human video pose estimation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3063\u20133072 (2016)","DOI":"10.1109\/CVPR.2016.334"},{"key":"12_CR9","unstructured":"Chen, J., et al.: Pixart-$$\\alpha $$: fast training of diffusion transformer for photorealistic text-to-image synthesis (2023)"},{"key":"12_CR10","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis (2021)"},{"key":"12_CR11","doi-asserted-by":"crossref","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., Taigman, Y.: Make-a-scene: scene-based text-to-image generation with human priors. In: European Conference on Computer Vision, pp. 89\u2013106 (2022)","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"12_CR12","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural Inf. Process. Syst. 33, 6840\u20136851 (2020)"},{"key":"12_CR13","unstructured":"Ho, J., Saharia, C., Chan, W., Fleet, D.J., Norouzi, M., Salimans, T.: Cascaded diffusion models for high fidelity image generation. J. Mach. Learn. Res. 23, 2249\u20132281 (2022)"},{"key":"12_CR14","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"12_CR15","unstructured":"Huang, L., Chen, D., Liu, Y., Shen, Y., Zhao, D., Zhou, J.: Composer: creative and controllable image synthesis with composable conditions. arXiv preprint arXiv:2302.09778 (2023)"},{"key":"12_CR16","doi-asserted-by":"crossref","unstructured":"Iqbal, U., Milan, A., Gall, J.: PoseTrack: joint multi-person pose estimation and tracking. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2011\u20132020 (2017)","DOI":"10.1109\/CVPR.2017.495"},{"key":"12_CR17","doi-asserted-by":"crossref","unstructured":"Ju, X., Zeng, A., Wang, J., Xu, Q., Zhang, L.: Human-Art: a versatile human-centric dataset bridging natural and artificial scenes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2023)","DOI":"10.1109\/CVPR52729.2023.00067"},{"key":"12_CR18","doi-asserted-by":"crossref","unstructured":"Ju, X., Zeng, A., Zhao, C., Wang, J., Zhang, L., Xu, Q.: HumanSD: a native skeleton-guided diffusion model for human image generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 15988\u201315998 (2023)","DOI":"10.1109\/ICCV51070.2023.01465"},{"key":"12_CR19","doi-asserted-by":"crossref","unstructured":"Kang, M., et al.: Scaling up GANs for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10124\u201310134 (2023)","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"12_CR20","doi-asserted-by":"crossref","unstructured":"Li, J., Wang, C., Zhu, H., Mao, Y., Fang, H.S., Lu, C.: CrowdPose: efficient crowded scenes pose estimation and a new benchmark. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10863\u201310872 (2019)","DOI":"10.1109\/CVPR.2019.01112"},{"key":"12_CR21","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GLIGEN: open-set grounded text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22511\u201322521 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"12_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"TY Lin","year":"2014","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"12_CR23","unstructured":"Luo, S., Tan, Y., Huang, L., Li, J., Zhao, H.: Latent consistency models: synthesizing high-resolution images with few-step inference (2023)"},{"key":"12_CR24","unstructured":"Lv, W., et al.: DETRs beat YOLOs on real-time object detection (2023)"},{"issue":"1","key":"12_CR25","first-page":"1","volume":"16","author":"P Madhu","year":"2022","unstructured":"Madhu, P., et al.: Enhancing human pose estimation in ancient vase paintings via perceptually-grounded style transfer learning. ACM J. Comput. Cult. Heritage 16(1), 1\u201317 (2022)","journal-title":"ACM J. Comput. Cult. Heritage"},{"key":"12_CR26","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2I-Adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"12_CR27","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2I-Adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Narasimhaswamy, S., Nguyen, T., Huang, M., Hoai, M.: Whose hands are these? Hand detection and hand-body association in the wild. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4889\u20134899 (2022)","DOI":"10.1109\/CVPR52688.2022.00484"},{"key":"12_CR29","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"12_CR30","unstructured":"OpenAI: Improving image generation with better captions (2023). https:\/\/cdn.openai.com\/papers\/dall-e-3.pdf"},{"key":"12_CR31","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis (2023)"},{"key":"12_CR32","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"12_CR33","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"12_CR34","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"12_CR35","doi-asserted-by":"crossref","unstructured":"Redmon, J., Divvala, S., Girshick, R., Farhadi, A.: You only look once: unified, real-time object detection. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 779\u2013788 (2016)","DOI":"10.1109\/CVPR.2016.91"},{"key":"12_CR36","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"12_CR37","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"12_CR38","unstructured":"Sauer, A., Karras, T., Laine, S., Geiger, A., Aila, T.: StyleGAN-T: unlocking the power of GANs for fast large-scale text-to-image synthesis. arXiv preprint arXiv:2301.09515 (2023)"},{"key":"12_CR39","unstructured":"Schuhmann, C.: LAION-aesthetics predictor V2 (2023). https:\/\/laion.ai\/blog\/laion-aesthetics\/"},{"key":"12_CR40","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: LAION-5B: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"12_CR41","unstructured":"Segmind: Announcing SSD-1B: a leap in efficient T2I generation (2023). https:\/\/blog.segmind.com\/introducing-segmind-ssd-1b\/"},{"key":"12_CR42","unstructured":"Shonenkov, A., Konstantinov, M., Bakshandaeva, D., Schuhmann, C., Ivanova, K., Klokova, N.: DeepFloyd IF: a powerful text-to-image model that can smartly integrate text into images (2023). https:\/\/www.deepfloyd.ai\/deepfloyd-if"},{"key":"12_CR43","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"12_CR44","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789 (2022)"},{"key":"12_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"12_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, P., Yang, L., Lai, J.H., Xie, X.: Exploring dual-task correlation for pose guided person image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7713\u20137722 (2022)","DOI":"10.1109\/CVPR52688.2022.00756"},{"key":"12_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, S.H., et al.: Pose2Seg: detection free human instance segmentation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 889\u2013898 (2019)","DOI":"10.1109\/CVPR.2019.00098"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73411-3_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,22]],"date-time":"2024-11-22T21:25:29Z","timestamp":1732310729000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73411-3_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,23]]},"ISBN":["9783031734106","9783031734113"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73411-3_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,23]]},"assertion":[{"value":"23 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}