{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:26:50Z","timestamp":1777656410578,"version":"3.51.4"},"publisher-location":"Cham","reference-count":61,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031919060","type":"print"},{"value":"9783031919077","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-91907-7_15","type":"book-chapter","created":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T11:45:56Z","timestamp":1748346356000},"page":"252-269","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":15,"title":["BootPIG: Bootstrapping Zero-Shot Personalized Image Generation Capabilities in\u00a0Pretrained Diffusion Models"],"prefix":"10.1007","author":[{"given":"Senthil","family":"Purushwalkam","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Akash","family":"Gokul","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9222-2641","authenticated-orcid":false,"given":"Shafiq","family":"Joty","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nikhil","family":"Naik","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,12]]},"reference":[{"key":"15_CR1","doi-asserted-by":"crossref","unstructured":"Arar, M., et al.: Domain-agnostic tuning-encoder for fast personalization of text-to-image models. arXiv preprint arXiv:2307.06925 (2023)","DOI":"10.1145\/3610548.3618173"},{"key":"15_CR2","unstructured":"Brown, T.B., et al.: Language models are few-shot learners. ArXiv abs\/2005.14165 (2020). https:\/\/api.semanticscholar.org\/CorpusID:218971783"},{"key":"15_CR3","unstructured":"Chang, H., et\u00a0al.: Muse: Text-to-image generation via masked generative transformers. arXiv preprint arXiv:2301.00704 (2023)"},{"key":"15_CR4","doi-asserted-by":"crossref","unstructured":"Chang, H., Zhang, H., Jiang, L., Liu, C., Freeman, W.T.: MaskGIT: masked generative image transformer. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11315\u201311325 (2022)","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"15_CR5","unstructured":"Chen, H., et al.: DisenBooth: identity-preserving disentangled tuning for subject-driven text-to-image generation. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: 2009 IEEE Conference on Computer Vision and Pattern Recognition, pp. 248\u2013255. IEEE (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"15_CR7","first-page":"16890","volume":"35","author":"M Ding","year":"2022","unstructured":"Ding, M., Zheng, W., Hong, W., Tang, J.: CogView2: faster and better text-to-image generation via hierarchical transformers. Adv. Neural. Inf. Process. Syst. 35, 16890\u201316902 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR8","unstructured":"Gal, R., et al.: An image is worth one word: Personalizing text-to-image generation using textual inversion. arXiv preprint arXiv:2208.01618 (2022)"},{"issue":"4","key":"15_CR9","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592133","volume":"42","author":"R Gal","year":"2023","unstructured":"Gal, R., Arar, M., Atzmon, Y., Bermano, A.H., Chechik, G., Cohen-Or, D.: Encoder-based domain tuning for fast personalization of text-to-image models. ACM Trans. Graph. (TOG) 42(4), 1\u201313 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"issue":"11","key":"15_CR10","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"15_CR11","unstructured":"Gu, Y., et\u00a0al.: Mix-of-show: Decentralized low-rank adaptation for multi-concept customization of diffusion models. arXiv preprint arXiv:2305.18292 (2023)"},{"key":"15_CR12","unstructured":"Hao, S., Han, K., Zhao, S., Wong, K.Y.K.: ViCo: Detail-preserving visual condition for personalized text-to-image generation. arXiv preprint arXiv:2306.00971 (2023)"},{"key":"15_CR13","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"15_CR14","unstructured":"Ho, J.: Classifier-free diffusion guidance. ArXiv abs\/2207.12598 (2022). https:\/\/api.semanticscholar.org\/CorpusID:249145348"},{"key":"15_CR15","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR16","unstructured":"Hu, M., et al.: Cocktail: mixing multi-modality control for text-conditional image generation. In: Thirty-seventh Conference on Neural Information Processing Systems (2023)"},{"key":"15_CR17","unstructured":"Hua, M., Liu, J., Ding, F., Liu, W., Wu, J., He, Q.: DreamTuner: Single image is enough for subject-driven generation. ArXiv abs\/2312.13691 (2023). https:\/\/api.semanticscholar.org\/CorpusID:266436022"},{"key":"15_CR18","unstructured":"Jia, X., et al.: Taming encoder for zero fine-tuning image customization with text-to-image diffusion models. arXiv preprint arXiv:2304.02642 (2023)"},{"key":"15_CR19","doi-asserted-by":"crossref","unstructured":"Kang, M., et al.: Scaling up GANs for text-to-image synthesis. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10124\u201310134 (2023)","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"15_CR20","unstructured":"Kim, S., Lee, J., Hong, K., Kim, D., Ahn, N.: DiffBlender: Scalable and composable multimodal text-to-image diffusion models. arXiv preprint arXiv:2305.15194 (2023)"},{"key":"15_CR21","unstructured":"Kirillov, A., et al.: Segment anything. ArXiv abs\/2304.02643 (2023). https:\/\/api.semanticscholar.org\/CorpusID:257952310"},{"key":"15_CR22","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y.: Multi-concept customization of text-to-image diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1931\u20131941 (2023)","DOI":"10.1109\/CVPR52729.2023.00192"},{"issue":"7","key":"15_CR23","doi-asserted-by":"publisher","first-page":"1956","DOI":"10.1007\/s11263-020-01316-z","volume":"128","author":"A Kuznetsova","year":"2020","unstructured":"Kuznetsova, A., et al.: The open images dataset V4: unified image classification, object detection, and visual relationship detection at scale. Int. J. Comput. Vision 128(7), 1956\u20131981 (2020)","journal-title":"Int. J. Comput. Vision"},{"key":"15_CR24","unstructured":"Li, D., Li, J., Hoi, S.C.: Blip-Diffusion: Pre-trained subject representation for controllable text-to-image generation and editing. arXiv preprint arXiv:2305.14720 (2023)"},{"key":"15_CR25","unstructured":"Li, J., Li, D., Savarese, S., Hoi, S.: BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models. arXiv preprint arXiv:2301.12597 (2023)"},{"key":"15_CR26","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: GLIGEN: open-set grounded text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22511\u201322521 (2023)","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: Computer Vision\u2013ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13, pp. 740\u2013755. Springer (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"15_CR28","doi-asserted-by":"crossref","unstructured":"Ma, J., Liang, J., Chen, C., Lu, H.: Subject-diffusion: Open domain personalized text-to-image generation without test-time fine-tuning. arXiv preprint arXiv:2307.11410 (2023)","DOI":"10.1145\/3641519.3657469"},{"key":"15_CR29","doi-asserted-by":"crossref","unstructured":"Melas-Kyriazi, L., Laina, I., Rupprecht, C., Vedaldi, A.: RealFusion: 360deg reconstruction of any object from a single image. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 8446\u20138455 (2023)","DOI":"10.1109\/CVPR52729.2023.00816"},{"key":"15_CR30","unstructured":"Meng, C., et al.: SDEdit: Guided image synthesis and editing with stochastic differential equations. arXiv preprint arXiv:2108.01073 (2021)"},{"key":"15_CR31","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2I-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453 (2023)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"15_CR32","unstructured":"von Platen, P., et al.: Diffusers: State-of-the-art diffusion models. https:\/\/github.com\/huggingface\/diffusers"},{"key":"15_CR33","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: DreamFusion: Text-to-3D using 2D diffusion. arXiv preprint arXiv:2209.14988 (2022)"},{"key":"15_CR34","unstructured":"Purushwalkam, S., Naik, N.: ConRad: Image constrained radiance fields for 3d generation from a single image. arXiv preprint arXiv:2311.05230 (2023)"},{"key":"15_CR35","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"15_CR36","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"15_CR37","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"15_CR38","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"15_CR39","doi-asserted-by":"crossref","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Medical Image Computing and Computer-Assisted Intervention\u2013MICCAI 2015: 18th International Conference, Munich, Germany, October 5-9, 2015, Proceedings, Part III 18, pp. 234\u2013241. Springer (2015)","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"15_CR40","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"15_CR41","doi-asserted-by":"crossref","unstructured":"Ruiz, N., et al.: HyperDreamBooth: Hypernetworks for fast personalization of text-to-image models. arXiv preprint arXiv:2307.06949 (2023)","DOI":"10.1109\/CVPR52733.2024.00624"},{"key":"15_CR42","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR43","doi-asserted-by":"crossref","unstructured":"Shi, J., Xiong, W., Lin, Z., Jung, H.J.: InstantBooth: Personalized text-to-image generation without test-time finetuning. arXiv preprint arXiv:2304.03411 (2023)","DOI":"10.1109\/CVPR52733.2024.00816"},{"key":"15_CR44","unstructured":"Smith, J.S., et al.: Continual diffusion: Continual customization of text-to-image diffusion with C-LoRA. arXiv preprint arXiv:2304.06027 (2023)"},{"key":"15_CR45","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"15_CR46","unstructured":"Song, Y., Ermon, S.: Generative modeling by estimating gradients of the data distribution. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"15_CR47","unstructured":"Vaswani, A., et al.: Attention is all you need. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"key":"15_CR48","unstructured":"Voynov, A., Chu, Q., Cohen-Or, D., Aberman, K.: $$ p+ $$: Extended textual conditioning in text-to-image generation. arXiv preprint arXiv:2303.09522 (2023)"},{"key":"15_CR49","doi-asserted-by":"crossref","unstructured":"Wallace, B., Gokul, A., Naik, N.: EDICT: exact diffusion inversion via coupled transformations. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22532\u201322541 (2023)","DOI":"10.1109\/CVPR52729.2023.02158"},{"key":"15_CR50","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhang, Y., Ji, Z., Bai, J., Zhang, L., Zuo, W.: ELITE: Encoding visual concepts into textual embeddings for customized text-to-image generation. arXiv preprint arXiv:2302.13848 (2023)","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"15_CR51","doi-asserted-by":"crossref","unstructured":"Xie, S., Zhang, Z., Lin, Z., Hinz, T., Zhang, K.: SmartBrush: text and shape guided object inpainting with diffusion model. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22428\u201322437 (2023)","DOI":"10.1109\/CVPR52729.2023.02148"},{"key":"15_CR52","doi-asserted-by":"crossref","unstructured":"Xu, D., Jiang, Y., Wang, P., Fan, Z., Wang, Y., Wang, Z.: NeuralLift-360: lifting an in-the-wild 2D photo to a 3D object with 360deg views. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4479\u20134489 (2023)","DOI":"10.1109\/CVPR52729.2023.00435"},{"key":"15_CR53","unstructured":"Ye, H., Zhang, J., Liu, S., Han, X., Yang, W.: IP-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arXiv:2308.06721 (2023)"},{"key":"15_CR54","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.107892(3), \u00a05 (2022)"},{"key":"15_CR55","unstructured":"Yu, L., et\u00a0al.: Scaling autoregressive multi-modal models: Pretraining and instruction tuning. arXiv preprint arXiv:2309.02591 (2023)"},{"key":"15_CR56","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"15_CR57","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Inversion-based style transfer with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10146\u201310156 (2023)","DOI":"10.1109\/CVPR52729.2023.00978"},{"key":"15_CR58","unstructured":"Zhang, Y., et al.: SSR-encoder: Encoding selective subject representation for subject-driven generation. ArXiv abs\/2312.16272 (2023), https:\/\/api.semanticscholar.org\/CorpusID:266573690"},{"key":"15_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: SSR-encoder: Encoding selective subject representation for subject-driven generation. arXiv preprint arXiv:2312.16272 (2023)","DOI":"10.1109\/CVPR52733.2024.00771"},{"key":"15_CR60","unstructured":"Zhao, S., et al.: Uni-Controlnet: All-in-one control to text-to-image diffusion models. arXiv preprint arXiv:2305.16322 (2023)"},{"key":"15_CR61","unstructured":"Zhou, Y., Zhang, R., Sun, T., Xu, J.: Enhancing detail preservation for customized text-to-image generation: A regularization-free approach. ArXiv abs\/2305.13579 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258841022"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-91907-7_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T16:11:19Z","timestamp":1757175079000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-91907-7_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9783031919060","9783031919077"],"references-count":61,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-91907-7_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"12 May 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}