{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T23:01:38Z","timestamp":1772838098230,"version":"3.50.1"},"publisher-location":"Cham","reference-count":47,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730382","type":"print"},{"value":"9783031730399","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73039-9_8","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T14:57:07Z","timestamp":1730300227000},"page":"126-142","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["BeyondScene: Higher-Resolution Human-Centric Scene Generation with Pretrained Diffusion"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6570-236X","authenticated-orcid":false,"given":"Gwanghyun","family":"Kim","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0008-7461-9750","authenticated-orcid":false,"given":"Hayeon","family":"Kim","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0001-9053-2224","authenticated-orcid":false,"given":"Hoigi","family":"Seo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2486-2783","authenticated-orcid":false,"given":"Dong 
Un","family":"Kang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8739-8960","authenticated-orcid":false,"given":"Se Young","family":"Chun","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"8_CR1","unstructured":"Language segment anything. https:\/\/github.com\/luca-medeiros\/lang-segment-anything"},{"key":"8_CR2","unstructured":"Midjourney. https:\/\/www.midjourney.com"},{"key":"8_CR3","unstructured":"SDXL-ControlNet: OpenPose (V2). https:\/\/huggingface.co\/thibaud\/controlnet-openpose-sdxl-1.0"},{"key":"8_CR4","unstructured":"SDXL inpainting 0.1. https:\/\/huggingface.co\/diffusers\/stable-diffusion-xl-1.0-inpainting-0.1"},{"key":"8_CR5","unstructured":"Achiam, J., et\u00a0al.: GPT-4 technical report. arXiv preprint arXiv:2303.08774 (2023)"},{"key":"8_CR6","unstructured":"Bar-Tal, O., Yariv, L., Lipman, Y., Dekel, T.: Multidiffusion: fusing diffusion paths for controlled image generation. arXiv:2302.08113 (2023)"},{"key":"8_CR7","unstructured":"Chen, T.: On the importance of noise scheduling for diffusion models. arXiv preprint arXiv:2301.10972 (2023)"},{"key":"8_CR8","unstructured":"Cheong, S.Y., Mustafa, A., Gilbert, A.: KPE: keypoint pose encoding for transformer-based image generation. In: British Machine Vision Conference (BMVC) (2022)"},{"key":"8_CR9","unstructured":"Ding, Z., Zhang, M., Wu, J., Tu, Z.: Patched denoising diffusion models for high-resolution image synthesis. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"8_CR10","unstructured":"He, Y., et al.: ScaleCrafter: tuning-free higher-resolution visual generation with diffusion models. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"8_CR11","unstructured":"Hoogeboom, E., Heek, J., Salimans, T.: Simple diffusion: end-to-end diffusion for high resolution images. 
arXiv preprint arXiv:2301.11093 (2023)"},{"key":"8_CR12","doi-asserted-by":"crossref","unstructured":"Ju, X., Zeng, A., Zhao, C., Wang, J., Zhang, L., Xu, Q.: HumanSD: a native skeleton-guided diffusion model for human image generation. arXiv preprint arXiv:2304.04269 (2023)","DOI":"10.1109\/ICCV51070.2023.01465"},{"key":"8_CR13","doi-asserted-by":"crossref","unstructured":"Ku, M., Jiang, D., Wei, C., Yue, X., Chen, W.: VIEScore: towards explainable metrics for conditional image synthesis evaluation. arXiv preprint arXiv:2312.14867 (2023)","DOI":"10.18653\/v1\/2024.acl-long.663"},{"key":"8_CR14","unstructured":"Lee, Y., Kim, K., Kim, H., Sung, M.: SyncDiffusion: coherent montage via synchronized joint diffusions. arXiv:2306.05178 (2023)"},{"key":"8_CR15","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GLIGEN: open-set grounded text-to-image generation. arXiv preprint arXiv:2301.07093 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"8_CR16","doi-asserted-by":"publisher","first-page":"8589","DOI":"10.1109\/TNNLS.2022.3151631","volume":"34","author":"D Liu","year":"2022","unstructured":"Liu, D., Wu, L., Zheng, F., Liu, L., Wang, M.: Verbal-Person nets: pose-guided multi-granularity language-to-person generation. IEEE Trans. Neural Netw. Learn. Syst. 34, 8589\u20138601 (2022)","journal-title":"IEEE Trans. Neural Netw. Learn. Syst."},{"key":"8_CR17","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"8_CR18","unstructured":"Liu, S., et\u00a0al.: LLaVA-plus: learning to use tools for creating multimodal agents. arXiv preprint arXiv:2311.05437 (2023)"},{"key":"8_CR19","doi-asserted-by":"crossref","unstructured":"Lv, Z., et al.: Learning semantic person image generation by region-adaptive normalization. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 
10806\u201310815 (2021)","DOI":"10.1109\/CVPR46437.2021.01066"},{"key":"8_CR20","unstructured":"Ma, L., Jia, X., Sun, Q., Schiele, B., Tuytelaars, T., Van\u00a0Gool, L.: Pose guided person image generation, vol.\u00a030 (2017)"},{"key":"8_CR21","doi-asserted-by":"crossref","unstructured":"Ma, T., Peng, B., Wang, W., Dong, J.: MUST-GAN: multi-level statistics transfer for self-driven person image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13622\u201313631 (2021)","DOI":"10.1109\/CVPR46437.2021.01341"},{"key":"8_CR22","doi-asserted-by":"crossref","unstructured":"Men, Y., Mao, Y., Jiang, Y., Ma, W.Y., Lian, Z.: Controllable person image synthesis with attribute-decomposed GAN. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 5084\u20135093 (2020)","DOI":"10.1109\/CVPR42600.2020.00513"},{"key":"8_CR23","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2I-Adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"8_CR24","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"8_CR25","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML), pp. 8748\u20138763. PMLR (2021)"},{"key":"8_CR26","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.06125 1(2), 3 (2022)"},{"key":"8_CR27","doi-asserted-by":"crossref","unstructured":"Ren, Y., Fan, X., Li, G., Liu, S., Li, T.H.: Neural texture extraction and distribution for controllable person image synthesis. 
In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 13535\u201313544 (2022)","DOI":"10.1109\/CVPR52688.2022.01317"},{"key":"8_CR28","doi-asserted-by":"crossref","unstructured":"Ren, Y., Yu, X., Chen, J., Li, T.H., Li, G.: Deep image spatial transformation for person image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7690\u20137699 (2020)","DOI":"10.1109\/CVPR42600.2020.00771"},{"key":"8_CR29","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR30","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models (2021)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"8_CR31","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"161","DOI":"10.1007\/978-3-031-19839-7_10","volume-title":"ECCV 2022","author":"P Roy","year":"2022","unstructured":"Roy, P., Ghosh, S., Bhattacharya, S., Pal, U., Blumenstein, M.: TIPS: text-induced pose synthesis. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13698, pp. 161\u2013178. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19839-7_10"},{"key":"8_CR32","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"8_CR33","unstructured":"Teng, J., et al.: Relay diffusion: unifying diffusion process across resolutions for image synthesis. 
arXiv preprint arXiv:2309.03350 (2023)"},{"key":"8_CR34","doi-asserted-by":"publisher","first-page":"5400","DOI":"10.1109\/TMM.2022.3192729","volume":"25","author":"L Wang","year":"2022","unstructured":"Wang, L., et al.: What happens in crowd scenes: a new dataset about crowd scenes for image captioning. IEEE Trans. Multimedia 25, 5400\u20135412 (2022)","journal-title":"IEEE Trans. Multimedia"},{"key":"8_CR35","unstructured":"Wu, J., et al.: GRiT: a generative region-to-text transformer for object understanding. arXiv:2212.00280 (2022)"},{"key":"8_CR36","doi-asserted-by":"crossref","unstructured":"Xie, E., et al.: DiffFit: unlocking transferability of large diffusion models via simple parameter-efficient fine-tuning. arXiv preprint arXiv:2304.06648 (2023)","DOI":"10.1109\/ICCV51070.2023.00390"},{"issue":"10","key":"8_CR37","doi-asserted-by":"publisher","first-page":"6486","DOI":"10.1109\/TPAMI.2021.3085339","volume":"44","author":"X Xu","year":"2021","unstructured":"Xu, X., Chen, Y.C., Tao, X., Jia, J.: Text-guided human image manipulation via image-text shared space. IEEE Trans. Pattern Anal. Mach. Intell. (PAMI) 44(10), 6486\u20136500 (2021)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell. (PAMI)"},{"key":"8_CR38","unstructured":"Xu, Y., Zhang, J., Zhang, Q., Tao, D.: ViTPose: simple vision transformer baselines for human pose estimation. In: NeurIPS, vol. 35, pp. 38571\u201338584 (2022)"},{"key":"8_CR39","doi-asserted-by":"crossref","unstructured":"Yang, F., Lin, G.: CT-Net: complementary transferring network for garment transfer with arbitrary geometric changes. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 9899\u20139908 (2021)","DOI":"10.1109\/CVPR46437.2021.00977"},{"key":"8_CR40","unstructured":"Yang, L., Yu, Z., Meng, C., Xu, M., Ermon, S., Cui, B.: Mastering text-to-image diffusion: recaptioning, planning, and generating with multimodal LLMs. 
arXiv preprint arXiv:2401.11708 (2024)"},{"key":"8_CR41","doi-asserted-by":"publisher","first-page":"2422","DOI":"10.1109\/TIP.2021.3052364","volume":"30","author":"L Yang","year":"2021","unstructured":"Yang, L., et al.: Towards fine-grained human pose transfer with detail replenishing network. IEEE Trans. Image Process. 30, 2422\u20132435 (2021)","journal-title":"IEEE Trans. Image Process."},{"key":"8_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, B., et al.: StyleSwin: transformer-based GAN for high-resolution image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11304\u201311314 (2022)","DOI":"10.1109\/CVPR52688.2022.01102"},{"key":"8_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, J., Li, K., Lai, Y.K., Yang, J.: PISE: person image synthesis and editing with decoupled GAN. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7982\u20137990 (2021)","DOI":"10.1109\/CVPR46437.2021.00789"},{"key":"8_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, L., Agrawala, M.: Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"8_CR45","doi-asserted-by":"crossref","unstructured":"Zhang, P., Yang, L., Lai, J.H., Xie, X.: Exploring dual-task correlation for pose guided person image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 7713\u20137722 (2022)","DOI":"10.1109\/CVPR52688.2022.00756"},{"key":"8_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, Q., Song, J., Huang, X., Chen, Y., Liu, M.Y.: DiffCollage: parallel generation of large content with diffusion models. 
arXiv:2303.17076 (2023)","DOI":"10.1109\/CVPR52729.2023.00982"},{"key":"8_CR47","doi-asserted-by":"crossref","unstructured":"Zheng, Q., et al.: Any-size-diffusion: toward efficient text-driven synthesis for any-size HD images. arXiv preprint arXiv:2308.16582 (2023)","DOI":"10.1609\/aaai.v38i7.28589"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73039-9_8","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T15:19:35Z","timestamp":1730301575000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73039-9_8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730382","9783031730399"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73039-9_8","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference 
Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}