{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,10]],"date-time":"2025-10-10T13:05:07Z","timestamp":1760101507564,"version":"3.40.3"},"publisher-location":"Cham","reference-count":47,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031733826"},{"type":"electronic","value":"9783031733833"}],"license":[{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,3]],"date-time":"2024-11-03T00:00:00Z","timestamp":1730592000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73383-3_23","type":"book-chapter","created":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T12:04:00Z","timestamp":1730549040000},"page":"394-410","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Powerful and\u00a0Flexible: Personalized Text-to-Image Generation via\u00a0Reinforcement Learning"],"prefix":"10.1007","author":[{"given":"Fanyue","family":"Wei","sequence":"first","affiliation":[]},{"given":"Wei","family":"Zeng","sequence":"additional","affiliation":[]},{"given":"Zhenyang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Dawei","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Lixin","family":"Duan","sequence":"additional","affiliation":[]},{"given":"Wen","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,3]]},"reference":[{"issue":"6","key":"23_CR1","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3618322","volume":"42","author":"Y Alaluf","year":"2023","unstructured":"Alaluf, Y., Richardson, E., Metzer, G., Cohen-Or, D.: A neural space-time representation for text-to-image personalization. ACM Trans. Graph. (TOG) 42(6), 1\u201310 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Arar, M., et al.: Domain-agnostic tuning-encoder for fast personalization of text-to-image models. In: SIGGRAPH Asia 2023 Conference Papers, pp. 1\u201310 (2023)","DOI":"10.1145\/3610548.3618173"},{"key":"23_CR3","unstructured":"Arar, M., et al.: PALP: prompt aligned personalization of text-to-image models. arXiv preprint arXiv:2401.06105 (2024)"},{"key":"23_CR4","unstructured":"Betker, J., et al.: Dall-e 3 (2023). https:\/\/cdn.openai.com\/papers\/dall-e-3.pdf"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"23_CR6","unstructured":"Chen, W., et al.: Subject-driven text-to-image generation via apprenticeship learning. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR7","unstructured":"Christiano, P.F., Leike, J., Brown, T., Martic, M., Legg, S., Amodei, D.: Deep reinforcement learning from human preferences. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"23_CR8","unstructured":"Clark, K., Vicol, P., Swersky, K., Fleet, D.J.: Directly fine-tuning diffusion models on differentiable rewards. In: The Twelfth International Conference on Learning Representations (2023)"},{"key":"23_CR9","first-page":"8780","volume":"34","author":"P Dhariwal","year":"2021","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. Adv. Neural. Inf. Process. Syst. 34, 8780\u20138794 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR10","unstructured":"Fan, Y., et al.: Reinforcement learning for fine-tuning text-to-image diffusion models. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR11","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. In: The Eleventh International Conference on Learning Representations (2022)"},{"issue":"4","key":"23_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592133","volume":"42","author":"R Gal","year":"2023","unstructured":"Gal, R., Arar, M., Atzmon, Y., Bermano, A.H., Chechik, G., Cohen-Or, D.: Encoder-based domain tuning for fast personalization of text-to-image models. ACM Trans. Graph. (TOG) 42(4), 1\u201313 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"23_CR13","unstructured":"Hao, S., Han, K., Zhao, S., Wong, K.Y.K.: VICO: detail-preserving visual condition for personalized text-to-image generation. arXiv preprint arXiv:2306.00971 (2023)"},{"key":"23_CR14","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. Adv. Neural. Inf. Process. Syst. 33, 6840\u20136851 (2020)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR15","unstructured":"Jia, X., et al.: Taming encoder for zero fine-tuning image customization with text-to-image diffusion models. arXiv preprint arXiv:2304.02642 (2023)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Kawar, B., et al.: Imagic: text-based real image editing with diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 6007\u20136017 (2023)","DOI":"10.1109\/CVPR52729.2023.00582"},{"key":"23_CR17","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-pic: an open dataset of user preferences for text-to-image generation. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y.: Multi-concept customization of text-to-image diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 1931\u20131941 (2023)","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"23_CR19","unstructured":"Lee, K., et al.: Aligning text-to-image models using human feedback. arXiv preprint arXiv:2302.12192 (2023)"},{"key":"23_CR20","unstructured":"Lee, K., Kwak, S., Sohn, K., Shin, J.: Direct consistency optimization for compositional text-to-image personalization. arXiv preprint arXiv:2402.12004 (2024)"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Lee, S.H., et al.: Parrot: pareto-optimal multi-reward reinforcement learning framework for text-to-image generation. arXiv preprint arXiv:2401.05675 (2024)","DOI":"10.1007\/978-3-031-72920-1_26"},{"key":"23_CR22","unstructured":"Li, D., Li, J., Hoi, S.: Blip-diffusion: pre-trained subject representation for controllable text-to-image generation and editing. In: Advances in Neural Information Processing Systems, vol. 36 (2024)"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Liang, Y., et al.: Rich human feedback for text-to-image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 19401\u201319411 (2024)","DOI":"10.1109\/CVPR52733.2024.01835"},{"key":"23_CR24","unstructured":"Lillicrap, T.P., et al.: Continuous control with deep reinforcement learning. arXiv preprint arXiv:1509.02971 (2015)"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Ma, J., Liang, J., Chen, C., Lu, H.: Subject-diffusion: open domain personalized text-to-image generation without test-time fine-tuning. arXiv preprint arXiv:2307.11410 (2023)","DOI":"10.1145\/3641519.3657469"},{"key":"23_CR26","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning, pp. 8162\u20138171. PMLR (2021)"},{"key":"23_CR27","unstructured":"Nichol, A.Q., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. In: International Conference on Machine Learning, pp. 16784\u201316804. PMLR (2022)"},{"key":"23_CR28","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"23_CR29","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"23_CR30","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"23_CR31","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference on Machine Learning, pp. 8821\u20138831. PMLR (2021)"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7008\u20137024 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"23_CR33","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"23_CR34","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: Dreambooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Ruiz, N., et al.: Hyperdreambooth: hypernetworks for fast personalization of text-to-image models. arXiv preprint arXiv:2307.06949 (2023)","DOI":"10.1109\/CVPR52733.2024.00624"},{"key":"23_CR36","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"23_CR37","doi-asserted-by":"crossref","unstructured":"Shi, J., Xiong, W., Lin, Z., Jung, H.J.: Instantbooth: personalized text-to-image generation without test-time finetuning. arXiv preprint arXiv:2304.03411 (2023)","DOI":"10.1109\/CVPR52733.2024.00816"},{"key":"23_CR38","unstructured":"Silver, D., Lever, G., Heess, N., Degris, T., Wierstra, D., Riedmiller, M.: Deterministic policy gradient algorithms. In: International Conference on Machine Learning, pp. 387\u2013395. PMLR (2014)"},{"key":"23_CR39","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"23_CR40","doi-asserted-by":"crossref","unstructured":"Tewel, Y., Gal, R., Chechik, G., Atzmon, Y.: Key-locked rank one editing for text-to-image personalization. In: ACM SIGGRAPH 2023 Conference Proceedings, pp. 1\u201311 (2023)","DOI":"10.1145\/3588432.3591506"},{"key":"23_CR41","unstructured":"Voynov, A., Chu, Q., Cohen-Or, D., Aberman, K.: $$ p+ $$: extended textual conditioning in text-to-image generation. arXiv preprint arXiv:2303.09522 (2023)"},{"key":"23_CR42","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhang, Y., Ji, Z., Bai, J., Zhang, L., Zuo, W.: Elite: encoding visual concepts into textual embeddings for customized text-to-image generation. arXiv preprint arXiv:2302.13848 (2023)","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"23_CR43","doi-asserted-by":"crossref","unstructured":"Wu, X., Sun, K., Zhu, F., Zhao, R., Li, H.: Better aligning text-to-image models with human preference. arXiv preprint arXiv:2303.14420 (2023)","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"23_CR44","unstructured":"Xu, J., et al.: Imagereward: learning and evaluating human preferences for text-to-image generation. arXiv preprint arXiv:2304.05977 (2023)"},{"key":"23_CR45","unstructured":"Yu, J., et al.: Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789, vol. 2, no. 3, p. 5 (2022)"},{"key":"23_CR46","unstructured":"Zhang, S., Xiao, S., Huang, W.: Forgedit: text guided image editing via learning and forgetting. arXiv preprint arXiv:2309.10556 (2023)"},{"key":"23_CR47","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Tzeng, E., Du, Y., Kislyuk, D.: Large-scale reinforcement learning for diffusion models. arXiv preprint arXiv:2401.12244 (2024)","DOI":"10.1007\/978-3-031-73036-8_1"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73383-3_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,30]],"date-time":"2024-11-30T18:06:48Z","timestamp":1732990008000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73383-3_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,3]]},"ISBN":["9783031733826","9783031733833"],"references-count":47,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73383-3_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,3]]},"assertion":[{"value":"3 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}