{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:12:44Z","timestamp":1778080364904,"version":"3.51.4"},"publisher-location":"Cham","reference-count":54,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726972","type":"print"},{"value":"9783031726989","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,26]],"date-time":"2024-10-26T00:00:00Z","timestamp":1729900800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72698-9_11","type":"book-chapter","created":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T04:45:57Z","timestamp":1729831557000},"page":"182-198","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":8,"title":["Enhancing Diffusion Models with\u00a0Text-Encoder Reinforcement Learning"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-6137-5162","authenticated-orcid":false,"given":"Chaofeng","family":"Chen","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2998-9817","authenticated-orcid":false,"given":"Annan","family":"Wang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8642-8101","authenticated-orcid":false,"given":"Haoning","family":"Wu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2238-2420","authenticated-orcid":false,"given":"Liang","family":"Liao","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5026-8820","authenticated-orcid":false,"given":"Wenxiu","family":"Sun","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2942-267X","authenticated-orcid":false,"given":"Qiong","family":"Yan","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9866-1947","authenticated-orcid":false,"given":"Weisi","family":"Lin","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,26]]},"reference":[{"key":"11_CR1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2021.108200","author":"MM Alam","year":"2021","unstructured":"Alam, M.M., Islam, M.T., Rahman, S.M.M.: Unified learning approach for egocentric hand gesture recognition and fingertip detection. Pattern Recogn. (2021). https:\/\/doi.org\/10.1016\/j.patcog.2021.108200","journal-title":"Pattern Recogn."},{"key":"11_CR2","unstructured":"Black, K., Janner, M., Du, Y., Kostrikov, I., Levine, S.: Training diffusion models with reinforcement learning. Int. Conf. Learn, Represent (2024)"},{"key":"11_CR3","unstructured":"Brock, A., Donahue, J., Simonyan, K.: Large scale GAN training for high fidelity natural image synthesis. In: International Conference on Learning Representations (2019)"},{"key":"11_CR4","unstructured":"Chen, C., Mo, J.: IQA-PyTorch: PyTorch toolbox for image quality assessment. Available https:\/\/github.com\/chaofengc\/IQA-PyTorch (2022)"},{"key":"11_CR5","doi-asserted-by":"crossref","unstructured":"Chen, C., et al.: Topiq: a top-down approach from semantics to distortions for image quality assessment. IEEE Trans. Image Process. (2024)","DOI":"10.1109\/TIP.2024.3378466"},{"key":"11_CR6","unstructured":"Christoph, S., Romain, B.: Laion-aesthetics (2022)"},{"key":"11_CR7","unstructured":"Clark, K., Vicol, P., Swersky, K., Fleet, D.J.: Directly fine-tuning diffusion models on differentiable rewards. In: International Conference on Learning Representations (2024)"},{"key":"11_CR8","unstructured":"Dai, X., et\u00a0al.: Emu: Enhancing image generation models using photogenic needles in a haystack. arXiv preprint arXiv:2309.15807 (2023)"},{"key":"11_CR9","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: Adv. Neural Inform. Process. Syst. (2021)"},{"key":"11_CR10","unstructured":"Dong, H., et al.: Raft: reward ranked finetuning for generative foundation model alignment. arXiv preprint arXiv:2304.06767 (2023)"},{"key":"11_CR11","unstructured":"Fan, Y., et al.: DPOK: reinforcement learning for fine-tuning text-to-image diffusion models. Adv. Neural Inform. Process, Syst (2024)"},{"key":"11_CR12","unstructured":"Gal, R., et al.: An image is worth one word: personalizing text-to-image generation using textual inversion. In: International Conference on Learning Representations (2023)"},{"issue":"11","key":"11_CR13","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"11_CR14","unstructured":"Guo, Y., et al.: AnimateDiff: animate your personalized text-to-image diffusion models without specific tuning. In: International Conference on Learning Representations (2024)"},{"key":"11_CR15","unstructured":"Hao, Y., Chi, Z., Dong, L., Wei, F.: Optimizing prompts for text-to-image generation. In: Advances in Neural Information Processing Systems (2023)"},{"key":"11_CR16","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: Advances in Neural Information Processing Systems (2020)"},{"key":"11_CR17","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"11_CR18","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. In: International Conference on Learning Representations (2022)"},{"key":"11_CR19","doi-asserted-by":"crossref","unstructured":"Hu, Y., Liu, B., Kasai, J., Wang, Y., Ostendorf, M., Krishna, R., Smith, N.A.: Tifa: accurate and interpretable text-to-image faithfulness evaluation with question answering. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.01866"},{"key":"11_CR20","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aila, T.: A style-based generator architecture for generative adversarial networks. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 4401\u20134410 (2019)","DOI":"10.1109\/CVPR.2019.00453"},{"key":"11_CR21","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2video-zero: Text-to-image diffusion models are zero-shot video generators. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"11_CR22","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-pic: an open dataset of user preferences for text-to-image generation. In: Advances in Neural Information Processing Systems(2024)"},{"key":"11_CR23","unstructured":"Lee, K., et al.: Aligning text-to-image models using human feedback. arXiv preprint arXiv:2302.12192 (2023)"},{"key":"11_CR24","doi-asserted-by":"publisher","unstructured":"Li, C., et al.: Agiqa-3k: An open database for AI-generated image quality assessment. IEEE Trans. Circuit Syst. Video Technol. 1\u20131 (2023).https:\/\/doi.org\/10.1109\/TCSVT.2023.3319020","DOI":"10.1109\/TCSVT.2023.3319020"},{"key":"11_CR25","doi-asserted-by":"crossref","unstructured":"Li, X., Hou, X., Loy, C.C.: When stylegan meets stable diffusion: a $$\\cal{W} _+$$ adapter for personalized image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition(2024)","DOI":"10.1109\/CVPR52733.2024.00213"},{"key":"11_CR26","doi-asserted-by":"crossref","unstructured":"Li, Y., et al.: Textcraftor: your text encoder can be image quality controller. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 7985\u20137995 (2024)","DOI":"10.1109\/CVPR52733.2024.00763"},{"key":"11_CR27","doi-asserted-by":"crossref","unstructured":"Liu, R., et al.: Character-aware models improve visual text rendering. arXiv preprint arXiv:2212.10562 (2022)","DOI":"10.18653\/v1\/2023.acl-long.900"},{"key":"11_CR28","doi-asserted-by":"crossref","unstructured":"Liu, R., Wu, R., Van\u00a0Hoorick, B., Tokmakov, P., Zakharov, S., Vondrick, C.: Zero-1-to-3: Zero-shot one image to 3D object. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9298\u20139309 (2023)","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"11_CR29","doi-asserted-by":"crossref","unstructured":"Mou, C., et al.: T2i-adapter: learning adapters to dig out more controllable ability for text-to-image diffusion models. AAAI (2024)","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"11_CR30","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. In: Advances in Neural Information Processing Systems (2022)"},{"key":"11_CR31","unstructured":"Poole, B., Jain, A., Barron, J.T., Mildenhall, B.: DreamFusion: text-to-3D using 2D diffusion. In: International Conference on Learning Representations (2023)"},{"key":"11_CR32","unstructured":"Prabhudesai, M., Goyal, A., Pathak, D., Fragkiadaki, K.: Aligning text-to-image diffusion models with reward backpropagation. arXiv preprint arXiv:2310.03739 (2023)"},{"key":"11_CR33","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proc. In: International Conference on Machine Learning, pp. 8748\u20138763 (2021)"},{"key":"11_CR34","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"11_CR35","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: International Conference On Machine Learning (2021)"},{"key":"11_CR36","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"11_CR37","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"11_CR38","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems (2022)"},{"key":"11_CR39","unstructured":"Schuhmann, C., et\u00a0al.: LAION-5B: an open large-scale dataset for training next generation image-text models. arXiv preprint arXiv:2210.08402 (2022)"},{"key":"11_CR40","unstructured":"Schuhmann, C., et al.: LAION-400M: Open dataset of clip-filtered 400 million image-text pairs. arXiv preprint arXiv:2111.02114 (2021)"},{"key":"11_CR41","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)"},{"key":"11_CR42","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning (2015)"},{"key":"11_CR43","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: International Conference on Learning Representations (2021)"},{"key":"11_CR44","first-page":"1415","volume":"34","author":"Y Song","year":"2021","unstructured":"Song, Y., Durkan, C., Murray, I., Ermon, S.: Maximum likelihood training of score-based diffusion models. Adv. Neural Inform. Process. Syst. 34, 1415\u20131428 (2021)","journal-title":"Adv. Neural Inform. Process. Syst."},{"key":"11_CR45","unstructured":"Song, Y., Ermon, S.: Improved techniques for training score-based generative models. In: Advances in Neural Information Processing Systems (2020)"},{"key":"11_CR46","volume-title":"Going the Extra Mile in Face Image Quality Assessment: A Novel Database and Model","author":"S Su","year":"2023","unstructured":"Su, S., et al.: Going the Extra Mile in Face Image Quality Assessment: A Novel Database and Model. IEEE Trans, Multimedia (2023)"},{"key":"11_CR47","doi-asserted-by":"crossref","unstructured":"Wang, Z.J., Montoya, E., Munechika, D., Yang, H., Hoover, B., Chau, D.H.: DiffusionDB: a large-scale prompt gallery dataset for text-to-image generative models. arXiv preprint arXiv:2210.14896 (2022)","DOI":"10.18653\/v1\/2023.acl-long.51"},{"key":"11_CR48","unstructured":"Witteveen, S., Andrews, M.: Investigating prompt engineering in diffusion models. arXiv preprint arXiv:2211.15462 (2022)"},{"key":"11_CR49","unstructured":"Wu, H., et al.: Q-bench: a benchmark for general-purpose foundation models on low-level vision. In: International Conference on Learning Representations (2024)"},{"key":"11_CR50","doi-asserted-by":"crossref","unstructured":"Wu, H., et al.: Q-instruct: improving low-level visual abilities for multi-modality foundation models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition (2024)","DOI":"10.1109\/CVPR52733.2024.02408"},{"key":"11_CR51","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7623\u20137633 (2023)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"11_CR52","doi-asserted-by":"crossref","unstructured":"Wu, X., Sun, K., Zhu, F., Zhao, R., Li, H.: Better aligning text-to-image models with human preference. Int. Conf. Comput, Vis (2023)","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"11_CR53","unstructured":"Xu, J., et al.: Imagereward: learning and evaluating human preferences for text-to-image generation. In: Advances in Neural Information Processing Systems (2023)"},{"key":"11_CR54","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision (2023)","DOI":"10.1109\/ICCV51070.2023.00355"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72698-9_11","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,25]],"date-time":"2024-10-25T04:50:32Z","timestamp":1729831832000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72698-9_11"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,26]]},"ISBN":["9783031726972","9783031726989"],"references-count":54,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72698-9_11","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,26]]},"assertion":[{"value":"26 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}