{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,24]],"date-time":"2025-05-24T13:10:09Z","timestamp":1748092209042,"version":"3.41.0"},"publisher-location":"Cham","reference-count":51,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031915680","type":"print"},{"value":"9783031915697","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91569-7_17","type":"book-chapter","created":{"date-parts":[[2025,5,24]],"date-time":"2025-05-24T12:49:47Z","timestamp":1748090987000},"page":"267-285","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Flow chart depicting a process with a central node labeled \"VisionNet.\" The chart likely represents a system or model related to vision processing, with \"VisionNet\" as the focal point. The layout suggests a network or sequence of steps, though specific connections or additional nodes are not visible. : Bridging and\u00a0Harmonizing Flow chart with a central node labeled \"Visual\" in pink text. The chart likely represents a process or concept related to visual elements, but no additional nodes or connections are visible. The background is white, emphasizing the central text. and\u00a0Textual Conditioning for\u00a0 Flow chart with a central node labeled \"ConvexNet.\" The chart likely represents a process or system related to this term, with potential connections or pathways not visible in the image."],"prefix":"10.1007","author":[{"given":"Soon Yau","family":"Cheong","sequence":"first","affiliation":[]},{"given":"Armin","family":"Mustafa","sequence":"additional","affiliation":[]},{"given":"Andrew","family":"Gilbert","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"17_CR1","unstructured":"Bhunia, A.K., et al.: Person image synthesis via denoising diffusion model. In: IEEE Conference of Computer Vision and Pattern Recognition (CVPR) (2023)"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Cao, M., Wang, X., Qi, Z., Shan, Y., Qie, X., Zheng, Y.: Masactrl: tuning-free mutual self-attention control for consistent image synthesis and editing. In: Proceeding of International Computer Vision Conference (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.02062"},{"key":"17_CR3","unstructured":"Cao, Z., Hidalgo, G., Simon, T., Wei, S.E., Sheikh, Y.: Openpose: realtime multi-person 2D pose estimation using part affinity fields. IEEE Trans. Pattern Anal. Mach. Intell. (2019)"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. SIGGRAPH (2023)","DOI":"10.1145\/3592116"},{"key":"17_CR5","unstructured":"Chen, H., Zhang, Y., Wang, X., Duan, X., Zhou, Y., Zhu, W.: Disenbooth: disentangled parameter-efficient tuning for subject-driven text-to-image generation. arXiv preprint arXiv:2305.03374 (2023)"},{"key":"17_CR6","unstructured":"Chen, W., et al.: Subject-driven text-to-image generation via apprenticeship learning. arXiv preprint arXiv:2304.00186 (2023)"},{"key":"17_CR7","unstructured":"Cheong, S.Y., Mustafa, A., Gilbert, A.: KPE: keypoint pose encoding for transformer-based image generation. In: British Machine Vision Conference (BMVC) (2022)"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Cheong, S.Y., Mustafa, A., Gilbert, A.: Upgpt: universal diffusion model for person image generation, editing and pose transfer. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (ICCV) Workshops, pp. 4173\u20134182 (2023)","DOI":"10.1109\/ICCVW60793.2023.00451"},{"key":"17_CR9","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: Conference on Neural Information Processing Systems (NeurIPS) (2021). https:\/\/arxiv.org\/abs\/2105.05233"},{"key":"17_CR10","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"17_CR11","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. In: ICLR (2022)"},{"key":"17_CR12","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)"},{"key":"17_CR13","unstructured":"Goodfellow, I.J., et al.: Generative adversarial networks. In: Conference on Neural Information Processing Systems (NeurIPS) (2014)"},{"key":"17_CR14","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Conference on Neural Information Processing Systems (NeurIPS) (2020). https:\/\/arxiv.org\/abs\/2006.11239"},{"key":"17_CR15","unstructured":"Houlsby, N., et al.: Parameter-efficient transfer learning for NLP. In: ICML (2019)"},{"key":"17_CR16","unstructured":"Hu, L., Gao, X., Zhang, P., Sun, K., Zhang, B., Bo, L.: Animate anyone: consistent and controllable image-to-video synthesis for character animation. arXiv preprint arXiv:2311.17117 (2023)"},{"key":"17_CR17","unstructured":"HuggingFace: openai\/clip-vit-large-patch14 (2011). https:\/\/huggingface.co\/openai\/clip-vit-large-patch14"},{"key":"17_CR18","unstructured":"Jia, X., et al.: Taming encoder for zero fine-tuning image customization with text-to-image diffusion models. arXiv preprint arXiv:2304.02642 (2023)"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Yang, S., Qiu, H., Wu, W., Loy, C.C., Liu, Z.: Text2human: text-driven controllable human image generation. In: SIGGRAPH (2022)","DOI":"10.1145\/3528223.3530104"},{"key":"17_CR20","doi-asserted-by":"crossref","unstructured":"Ju, X., Zeng, A., Zhao, C., Wang, J., Zhang, L., Xu, Q.: Humansd: a native skeleton-guided diffusion model for human image generation. In: International Conference on Computer Vision (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.01465"},{"key":"17_CR21","unstructured":"Li, Y., Keuper, M., Zhang, D., Khoreva, A.: Divide & bind your attention for improved generative semantic nursing. In: BMVC (2023)"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft coco: common objects in context. In: European Conference on Computer Vision (ECCV) (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"17_CR23","unstructured":"Liu, X., et al.: Hyperhuman: hyper-realistic human generation with latent structural diffusion. arXiv preprint: arXiv:2310.08579 (2023)"},{"key":"17_CR24","unstructured":"Ma, L., Jia, X., Sun, Q., Schiele, B., Tuytelaars, T., Gool, L.V.: Pose guided person image generation. In: Conference on Neural Information Processing Systems (NeurIPS) (2017)"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2i-adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"17_CR26","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. In: Proceedings of Machine Learning Research (2021). https:\/\/arxiv.org\/pdf\/2112.10741.pdf"},{"key":"17_CR27","unstructured":"Pinkney, J.: Stable diffusion image variations (2022). https:\/\/github.com\/justinpinkney\/stable-diffusion"},{"key":"17_CR28","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning (ICML) (2021)"},{"key":"17_CR29","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv Preprint: arXiv:2204.06125 (2022)"},{"key":"17_CR30","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning (ICML) (2021)"},{"key":"17_CR31","doi-asserted-by":"crossref","unstructured":"Ren, Y., Fan, X., Li, G., Liu, S., Li, T.H.: Neural texture extraction and distribution for controllable person image synthesis. In: IEEE Conference of Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01317"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"17_CR33","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: International Conference on Medical Image Computing and Computer Assisted Interventions (MICCAI) (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"17_CR35","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint: arXiv:2205.11487 (2022)"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Shi, J., Xiong, W., Lin, Z., Jung, H.J.: Instantbooth: personalized text-to-image generation without test-time finetuning. arXiv preprint arXiv:2304.03411 (2023)","DOI":"10.1109\/CVPR52733.2024.00816"},{"key":"17_CR37","unstructured":"Stability.ai: Stable diffusion 2 (2023). https:\/\/github.com\/Stability-AI\/stablediffusion"},{"key":"17_CR38","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Conference on Neural Information Processing Systems (NeurIPS) (2017)"},{"key":"17_CR39","unstructured":"Wang, Q., et al.: Instantid: zero-shot identity-preserving generation in seconds. arXiv preprint arXiv:2401.07519 (2024)"},{"key":"17_CR40","doi-asserted-by":"crossref","unstructured":"Wang, T., et al.: Disco: disentangled control for referring human dance generation in real world. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52733.2024.00891"},{"key":"17_CR41","doi-asserted-by":"publisher","unstructured":"Wang, Z., Simoncelli, E., Bovik, A.: Multiscale structural similarity for image quality assessment. In: The Thrity-Seventh Asilomar Conference on Signals, Systems and Computers, 2003, vol.\u00a02, pp. 1398\u20131402 (2003). https:\/\/doi.org\/10.1109\/ACSSC.2003.1292216","DOI":"10.1109\/ACSSC.2003.1292216"},{"key":"17_CR42","unstructured":"Xu, Z., et al.: Magicanimate: temporally consistent human image animation using diffusion model. arxiv:2311.16498 (2023)"},{"key":"17_CR43","unstructured":"Ye, H., Zhang, J., Liu, S., Han, X., Yang, W.: IP-adapter: text compatible image prompt adapter for text-to-image diffusion models. arXiv Pre-print arXiv:2308.06721 (2023)"},{"key":"17_CR44","doi-asserted-by":"crossref","unstructured":"Zhang, J., Li, K., Lai, Y.K., Yang, J.: PISE: person image synthesis and editing with decoupled GAN. In: IEEE Conference of Computer Vision and Pattern Recognition (CVPR) (2021)","DOI":"10.1109\/CVPR46437.2021.00789"},{"key":"17_CR45","unstructured":"Zhang, K., et al.: Humandiffusion: a coarse-to-fine alignment diffusion framework for controllable text-driven person image generation. arXiv Preprint arXiv:2211.06235 (2022)"},{"key":"17_CR46","doi-asserted-by":"crossref","unstructured":"Zhang, L., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: International Computer Vision Conference (ICCV) (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"17_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, P., Yang, L., Lai, J., Xie, X.: Exploring dual-task correlation for pose guided person image generation. In: IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2022)","DOI":"10.1109\/CVPR52688.2022.00756"},{"key":"17_CR48","unstructured":"Zhao, S., et al.: Uni-controlnet: all-in-one control to text-to-image diffusion models. In: NeurIPS (2023)"},{"key":"17_CR49","doi-asserted-by":"crossref","unstructured":"Zhou, X., Yin, M., Chen, X., Sun, L., Gao, C., Li, Q.: Cross attention based style distribution for controllable person image synthesis. In: European Conference on Computer Vision (ECCV) IEEE Conference of Computer Vision and Pattern Rec (2022)","DOI":"10.1007\/978-3-031-19784-0_10"},{"key":"17_CR50","doi-asserted-by":"crossref","unstructured":"Zhu, Z., Huang, T., Shi, B., Yu, M., Wang, B., Bai, X.: Progressive pose attention transfer for person image generation. In: IEEE Conference of Computer Vision and Pattern Recognition (CVPR) (2019)","DOI":"10.1109\/CVPR.2019.00245"},{"key":"17_CR51","unstructured":"Ziwei, Luo, P., Qiu, S., Wang, X., Tang, X.L.: Deepfashion: powering robust clothes recognition and retrieval with rich annotations. In: Proceedings of IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91569-7_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,24]],"date-time":"2025-05-24T12:50:05Z","timestamp":1748091005000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91569-7_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031915680","9783031915697"],"references-count":51,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91569-7_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}