{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:12:05Z","timestamp":1777655525030,"version":"3.51.4"},"publisher-location":"Cham","reference-count":45,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730092","type":"print"},{"value":"9783031730108","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,10]],"date-time":"2024-11-10T00:00:00Z","timestamp":1731196800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73010-8_7","type":"book-chapter","created":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T13:12:26Z","timestamp":1731157946000},"page":"108-124","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":9,"title":["Deep Reward Supervisions for\u00a0Tuning Text-to-Image Diffusion Models"],"prefix":"10.1007","author":[{"given":"Xiaoshi","family":"Wu","sequence":"first","affiliation":[]},{"given":"Yiming","family":"Hao","sequence":"additional","affiliation":[]},{"given":"Manyuan","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Keqiang","family":"Sun","sequence":"additional","affiliation":[]},{"given":"Zhaoyang","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Guanglu","family":"Song","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Hongsheng","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,10]]},"reference":[{"key":"7_CR1","unstructured":"Bai, Y., et\u00a0al.: Training a helpful and harmless assistant with reinforcement learning from human feedback. arXiv preprint arXiv:2204.05862 (2022)"},{"key":"7_CR2","unstructured":"Black, K., Janner, M., Du, Y., Kostrikov, I., Levine, S.: Training diffusion models with reinforcement learning. arXiv preprint arXiv:2305.13301 (2023)"},{"key":"7_CR3","unstructured":"Clark, K., Vicol, P., Swersky, K., Fleet, D.J.: Directly fine-tuning diffusion models on differentiable rewards. arXiv preprint arXiv:2309.17400 (2023)"},{"key":"7_CR4","unstructured":"Fan, Y., et al.: DPOK: reinforcement learning for fine-tuning text-to-image diffusion models. arXiv preprint arXiv:2305.16381 (2023)"},{"issue":"11","key":"7_CR5","doi-asserted-by":"publisher","first-page":"139","DOI":"10.1145\/3422622","volume":"63","author":"I Goodfellow","year":"2020","unstructured":"Goodfellow, I., et al.: Generative adversarial networks. Commun. ACM 63(11), 139\u2013144 (2020)","journal-title":"Commun. ACM"},{"key":"7_CR6","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: NeurIPS (2017)"},{"key":"7_CR7","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS, vol. 33, pp. 6840\u20136851 (2020)"},{"key":"7_CR8","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)"},{"key":"7_CR9","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: Openclip (2021). https:\/\/doi.org\/10.5281\/zenodo.5143773, if you use this software, please cite it as below","DOI":"10.5281\/zenodo.5143773"},{"key":"7_CR10","doi-asserted-by":"crossref","unstructured":"Kim, G., Kwon, T., Ye, J.C.: DiffusionCLIP: text-guided diffusion models for robust image manipulation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 2426\u20132435 (2022)","DOI":"10.1109\/CVPR52688.2022.00246"},{"key":"7_CR11","unstructured":"Kingma, D.P., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"7_CR12","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-Pic: an open dataset of user preferences for text-to-image generation. arXiv preprint arXiv:2305.01569 (2023)"},{"key":"7_CR13","unstructured":"Lee, K., et al.: Aligning text-to-image models using human feedback. arXiv preprint arXiv:2302.12192 (2023)"},{"key":"7_CR14","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"7_CR15","unstructured":"Lu, C., Zhou, Y., Bao, F., Chen, J., Li, C., Zhu, J.: DPM-solver: a fast ode solver for diffusion probabilistic model sampling in around 10 steps. In: Advances in Neural Information Processing Systems, vol. 35, pp. 5775\u20135787 (2022)"},{"key":"7_CR16","unstructured":"Lu, C., Zhou, Y., Bao, F., Chen, J., Li, C., Zhu, J.: DPM-solver++: fast solver for guided sampling of diffusion probabilistic models. arXiv preprint arXiv:2211.01095 (2022)"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Murray, N., Marchesotti, L., Perronnin, F.: AVA: a large-scale database for aesthetic visual analysis. In: CVPR, pp. 2408\u20132415 (2012)","DOI":"10.1109\/CVPR.2012.6247954"},{"key":"7_CR18","unstructured":"Ouyang, L., et al.: Training language models to follow instructions with human feedback. In: NeurIPS, vol. 35, pp. 27730\u201327744 (2022)"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Piao, J., Sun, K., Wang, Q., Lin, K.Y., Li, H.: Inverting generative adversarial renderer for face reconstruction. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15619\u201315628 (2021)","DOI":"10.1109\/CVPR46437.2021.01536"},{"key":"7_CR20","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"7_CR21","unstructured":"Prabhudesai, M., Goyal, A., Pathak, D., Fragkiadaki, K.: Aligning text-to-image diffusion models with reward backpropagation. arXiv preprint arXiv:2310.03739 (2023)"},{"key":"7_CR22","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"7_CR23","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv abs\/2204.06125 (2022)"},{"key":"7_CR24","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10674\u201310685 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"7_CR26","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K.: DreamBooth: fine tuning text-to-image diffusion models for subject-driven generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 22500\u201322510 (2023)","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"7_CR27","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. In: NeurIPS, vol. 35, pp. 36479\u201336494 (2022)"},{"key":"7_CR28","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. In: Advances in Neural Information Processing Systems, vol. 29 (2016)"},{"key":"7_CR29","unstructured":"Schuhmann, C.: CLIP+MLP Aesthetic Score Predictor (2022). https:\/\/github.com\/christophschuhmann\/improved-aesthetic-predictor"},{"key":"7_CR30","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: International Conference on Machine Learning, pp. 2256\u20132265. PMLR (2015)"},{"key":"7_CR31","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"7_CR32","unstructured":"Stiennon, N., et al.: Learning to summarize with human feedback. In: Advances in Neural Information Processing Systems, vol. 33, pp. 3008\u20133021 (2020)"},{"key":"7_CR33","unstructured":"Sun, K., Wu, S., Huang, Z., Zhang, N., Wang, Q., Li, H.: Controllable 3D face synthesis with conditional generative occupancy fields. In: Advances in Neural Information Processing Systems, vol. 35, pp. 16331\u201316343 (2022)"},{"key":"7_CR34","doi-asserted-by":"crossref","unstructured":"Sun, K., Wu, S., Zhang, N., Huang, Z., Wang, Q., Li, H.: CGOF++: controllable 3D face synthesis with conditional generative occupancy fields. IEEE Trans. Pattern Anal. Mach. Intell. (2023)","DOI":"10.1109\/TPAMI.2023.3328912"},{"key":"7_CR35","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"402","DOI":"10.1007\/978-3-030-58536-5_24","volume-title":"Computer Vision \u2013 ECCV 2020","author":"Z Teed","year":"2020","unstructured":"Teed, Z., Deng, J.: RAFT: recurrent all-pairs field transforms for optical flow. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12347, pp. 402\u2013419. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58536-5_24"},{"key":"7_CR36","doi-asserted-by":"crossref","unstructured":"Wallace, B., Gokul, A., Ermon, S., Naik, N.: End-to-end diffusion latent optimization improves classifier guidance. arXiv preprint arXiv:2303.13703 (2023)","DOI":"10.1109\/ICCV51070.2023.00669"},{"key":"7_CR37","unstructured":"Watson, D., Chan, W., Ho, J., Norouzi, M.: Learning fast samplers for diffusion models by differentiating through sample quality. In: International Conference on Learning Representations (2021)"},{"key":"7_CR38","unstructured":"Wu, X., et al.: Human preference score v2: a solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)"},{"key":"7_CR39","doi-asserted-by":"crossref","unstructured":"Wu, X., Sun, K., Zhu, F., Zhao, R., Li, H.: Better aligning text-to-image models with human preference. arXiv preprint arXiv:2303.14420 (2023)","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"7_CR40","doi-asserted-by":"crossref","unstructured":"Wu, X., Sun, K., Zhu, F., Zhao, R., Li, H.: Human preference score: better aligning text-to-image models with human preference. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 2096\u20132105 (2023)","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"7_CR41","unstructured":"Xu, J., et al.: ImageReward: learning and evaluating human preferences for text-to-image generation (2023)"},{"key":"7_CR42","unstructured":"Zhang, H., et al.: DINO: DETR with improved denoising anchor boxes for end-to-end object detection. arXiv preprint arXiv:2203.03605 (2022)"},{"key":"7_CR43","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"7_CR44","unstructured":"Zhu, X., Su, W., Lu, L., Li, B., Wang, X., Dai, J.: Deformable DETR: deformable transformers for end-to-end object detection. arXiv preprint arXiv:2010.04159 (2020)"},{"key":"7_CR45","unstructured":"Ziegler, D.M., et al.: Fine-tuning language models from human preferences. arXiv preprint arXiv:1909.08593 (2019)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73010-8_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,9]],"date-time":"2024-11-09T14:03:18Z","timestamp":1731160998000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73010-8_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,10]]},"ISBN":["9783031730092","9783031730108"],"references-count":45,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73010-8_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,10]]},"assertion":[{"value":"10 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}