{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:19:08Z","timestamp":1777655948153,"version":"3.51.4"},"publisher-location":"Cham","reference-count":60,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031726606","type":"print"},{"value":"9783031726613","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,27]],"date-time":"2024-11-27T00:00:00Z","timestamp":1732665600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72661-3_23","type":"book-chapter","created":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T07:46:47Z","timestamp":1732607207000},"page":"399-415","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Towards Reliable Advertising Image Generation Using Human Feedback"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1386-8381","authenticated-orcid":false,"given":"Zhenbang","family":"Du","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Wei","family":"Feng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haohan","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yaoyu","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingsen","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jian","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zheng","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingjing","family":"Lv","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xin","family":"Zhu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junsheng","family":"Jin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Junjie","family":"Shen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhangang","family":"Lin","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingping","family":"Shao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,11,27]]},"reference":[{"key":"23_CR1","unstructured":"Armandpour, M., Zheng, H., Sadeghian, A., Sadeghian, A., Zhou, M.: Re-imagine the negative prompt algorithm: transform 2D diffusion into 3D, alleviate Janus problem and beyond. arXiv preprint arXiv:2304.04968 (2023)"},{"key":"23_CR2","unstructured":"Black, K., Janner, M., Du, Y., Kostrikov, I., Levine, S.: Training diffusion models with reinforcement learning. In: ICLR (2024)"},{"key":"23_CR3","unstructured":"Casper, S., et\u00a0al.: Open problems and fundamental limitations of reinforcement learning from human feedback. arXiv preprint arXiv:2307.15217 (2023)"},{"key":"23_CR4","doi-asserted-by":"crossref","unstructured":"Chen, J., Ge, T., Jiang, G., Zhang, Z., Lian, D., Zheng, K.: Efficient optimal selection for composited advertising creatives with tree structure. In: AAAI, vol.\u00a035, pp. 3967\u20133975 (2021)","DOI":"10.1609\/aaai.v35i5.16516"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Chen, J., et al.: Automated creative optimization for e-commerce advertising. In: WWW, pp. 2304\u20132313 (2021)","DOI":"10.1145\/3442381.3449909"},{"key":"23_CR6","unstructured":"Christiano, P.F., Leike, J., Brown, T., Martic, M., Legg, S., Amodei, D.: Deep reinforcement learning from human preferences. In: NeurIPS, vol.\u00a030 (2017)"},{"key":"23_CR7","unstructured":"Clark, K., Vicol, P., Swersky, K., Fleet, D.J.: Directly fine-tuning diffusion models on differentiable rewards. In: ICLR (2024)"},{"key":"23_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: CVPR, pp. 248\u2013255 (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"23_CR9","unstructured":"Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT, pp. 4171\u20134186 (2019)"},{"key":"23_CR10","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: NeurIPS, vol.\u00a034, pp. 8780\u20138794 (2021)"},{"key":"23_CR11","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: transformers for image recognition at scale. In: ICLR (2021)"},{"key":"23_CR12","unstructured":"Fan, Y., Lee, K.: Optimizing DDPM sampling with shortcut fine-tuning. In: ICML, vol.\u00a0202, pp. 9623\u20139639 (2023)"},{"key":"23_CR13","unstructured":"Fan, Y., et al.: DPOK: reinforcement learning for fine-tuning text-to-image diffusion models. In: NeurIPS (2023)"},{"issue":"59","key":"23_CR14","first-page":"1","volume":"17","author":"Y Ganin","year":"2016","unstructured":"Ganin, Y., et al.: Domain-adversarial training of neural networks. JMLR 17(59), 1\u201335 (2016)","journal-title":"JMLR"},{"key":"23_CR15","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: NeurIPS, vol.\u00a027 (2014)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"23_CR17","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS (2020)"},{"key":"23_CR18","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. In: NeurIPS Workshop (2021)"},{"key":"23_CR19","unstructured":"Hu, E.J., et al.: Lora: low-rank adaptation of large language models. In: ICLR (2022)"},{"key":"23_CR20","unstructured":"Ibarz, B., Leike, J., Pohlen, T., Irving, G., Legg, S., Amodei, D.: Reward learning from human preferences and demonstrations in Atari. In: NeurIPS, vol.\u00a031 (2018)"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Isola, P., Zhu, J.Y., Zhou, T., Efros, A.A.: Image-to-image translation with conditional adversarial networks. In: CVPR, pp. 1125\u20131134 (2017)","DOI":"10.1109\/CVPR.2017.632"},{"key":"23_CR22","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-pic: an open dataset of user preferences for text-to-image generation. In: NeurIPS, vol.\u00a036 (2024)"},{"key":"23_CR23","unstructured":"Ku, Y.N., Kuznetsov, M., Mishra, S., de\u00a0Juan, P.: Staging e-commerce products for online advertising using retrieval assisted image generation. arXiv preprint arXiv:2307.15326 (2023)"},{"key":"23_CR24","unstructured":"Lee, K., et al.: Aligning text-to-image models using human feedback. arXiv preprint arXiv:2302.12192 (2023)"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Lee, S.H., et\u00a0al.: Parrot: Pareto-optimal multi-reward reinforcement learning framework for text-to-image generation. arXiv preprint arXiv:2401.05675 (2024)","DOI":"10.1007\/978-3-031-72920-1_26"},{"key":"23_CR26","unstructured":"Li, Z., et al.: Planning and rendering: towards end-to-end product poster generation. arXiv preprint arXiv:2312.08822 (2023)"},{"key":"23_CR27","unstructured":"Liang, Y., et\u00a0al.: Rich human feedback for text-to-image generation. arXiv preprint arXiv:2312.10240 (2023)"},{"key":"23_CR28","unstructured":"Liu, Y., et al.: RoBERTa: a robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692 (2019)"},{"issue":"1","key":"23_CR29","doi-asserted-by":"publisher","first-page":"20","DOI":"10.1177\/02783649211050958","volume":"41","author":"DP Losey","year":"2022","unstructured":"Losey, D.P., Bajcsy, A., O\u2019Malley, M.K., Dragan, A.D.: Physical interaction as communication: learning robot objectives online from human corrections. Int. J. Robot. Res. 41(1), 20\u201344 (2022)","journal-title":"Int. J. Robot. Res."},{"key":"23_CR30","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. In: ICLR (2019)"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Lugmayr, A., Danelljan, M., Romero, A., Yu, F., Timofte, R., Gool, L.V.: Repaint: inpainting using denoising diffusion probabilistic models. In: CVPR, pp. 11451\u201311461 (2022)","DOI":"10.1109\/CVPR52688.2022.01117"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"Mishra, S., Verma, M., Zhou, Y., Thadani, K., Wang, W.: Learning to create better ads: generation and ranking approaches for ad creative refinement. In: CIKM, pp. 2653\u20132660 (2020)","DOI":"10.1145\/3340531.3412720"},{"key":"23_CR33","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. In: ICML (2021)"},{"key":"23_CR34","unstructured":"Ouyang, L., et\u00a0al.: Training language models to follow instructions with human feedback. In: NeurIPS, vol.\u00a035, pp. 27730\u201327744 (2022)"},{"key":"23_CR35","unstructured":"Prabhudesai, M., Goyal, A., Pathak, D., Fragkiadaki, K.: Aligning text-to-image diffusion models with reward backpropagation (2023)"},{"key":"23_CR36","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107404","volume":"106","author":"X Qin","year":"2020","unstructured":"Qin, X., Zhang, Z., Huang, C., Dehghan, M., Zaiane, O., Jagersand, M.: U2-net: going deeper with nested u-structure for salient object detection. Pattern Recogn. 106, 107404 (2020)","journal-title":"Pattern Recogn."},{"key":"23_CR37","unstructured":"Rafailov, R., Sharma, A., Mitchell, E., Manning, C.D., Ermon, S., Finn, C.: Direct preference optimization: your language model is secretly a reward model. In: NeurIPS (2023)"},{"key":"23_CR38","doi-asserted-by":"crossref","unstructured":"Ranftl, R., Bochkovskiy, A., Koltun, V.: Vision transformers for dense prediction. In: ICCV, pp. 12179\u201312188 (2021)","DOI":"10.1109\/ICCV48922.2021.01196"},{"key":"23_CR39","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"23_CR40","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"23_CR41","unstructured":"Schuhmann, C., et\u00a0al.: Laion-5b: an open large-scale dataset for training next generation image-text models. In: NeurIPS, vol.\u00a035, pp. 25278\u201325294 (2022)"},{"key":"23_CR42","unstructured":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., Klimov, O.: Proximal policy optimization algorithms. arXiv preprint arXiv:1707.06347 (2017)"},{"key":"23_CR43","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. In: ICLR (2021)"},{"key":"23_CR44","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., Poole, B.: Score-based generative modeling through stochastic differential equations. In: ICLR (2021)"},{"key":"23_CR45","unstructured":"Touvron, H., et\u00a0al.: Llama 2: open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)"},{"key":"23_CR46","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS, vol.\u00a030 (2017)"},{"key":"23_CR47","unstructured":"Wallace, B., et al.: Diffusion model alignment using direct preference optimization. arXiv preprint arXiv:2311.12908 (2023)"},{"key":"23_CR48","doi-asserted-by":"crossref","unstructured":"Wallace, B., Gokul, A., Ermon, S., Naik, N.V.: End-to-end diffusion latent optimization improves classifier guidance. In: ICCV, pp. 7246\u20137256 (2023)","DOI":"10.1109\/ICCV51070.2023.00669"},{"key":"23_CR49","unstructured":"Wang, H., et\u00a0al.: Generate e-commerce product background by integrating category commonality and personalized style. arXiv preprint arXiv:2312.13309 (2023)"},{"key":"23_CR50","doi-asserted-by":"crossref","unstructured":"Wang, J., et al.: Deep high-resolution representation learning for visual recognition. IEEE Trans. Pattern Anal. Mach. Intell. 43, 3349\u20133364 (2019)","DOI":"10.1109\/TPAMI.2020.2983686"},{"key":"23_CR51","doi-asserted-by":"crossref","unstructured":"Wang, S., Liu, Q., Ge, T., Lian, D., Zhang, Z.: A hybrid bandit model with visual priors for creative ranking in display advertising. In: WWW, pp. 2324\u20132334 (2021)","DOI":"10.1145\/3442381.3449910"},{"key":"23_CR52","doi-asserted-by":"crossref","unstructured":"Wang, Z.J., Montoya, E., Munechika, D., Yang, H., Hoover, B., Chau, D.H.: DiffusionDB: a large-scale prompt gallery dataset for text-to-image generative models. In: ACL (2023)","DOI":"10.18653\/v1\/2023.acl-long.51"},{"key":"23_CR53","doi-asserted-by":"crossref","unstructured":"Wei, P., Liu, S., Yang, X., Wang, L., Zheng, B.: Towards personalized bundle creative generation with contrastive non-autoregressive decoding. In: SIGIR, pp. 2634\u20132638 (2022)","DOI":"10.1145\/3477495.3531909"},{"key":"23_CR54","unstructured":"Witteveen, S., Andrews, M.: Investigating prompt engineering in diffusion models. arXiv preprint arXiv:2211.15462 (2022)"},{"key":"23_CR55","doi-asserted-by":"crossref","unstructured":"Xie, S., Girshick, R.B., Doll\u00e1r, P., Tu, Z., He, K.: Aggregated residual transformations for deep neural networks. In: CVPR, pp. 5987\u20135995 (2016)","DOI":"10.1109\/CVPR.2017.634"},{"key":"23_CR56","unstructured":"Xu, J., et al.: Imagereward: learning and evaluating human preferences for text-to-image generation. In: NeurIPS (2023)"},{"key":"23_CR57","doi-asserted-by":"crossref","unstructured":"Yang, K., et al.: Using human feedback to fine-tune diffusion models without any reward model. arXiv preprint arXiv:2311.13231 (2023)","DOI":"10.1109\/CVPR52733.2024.00854"},{"key":"23_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: ICCV, pp. 3813\u20133824 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"23_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Tzeng, E., Du, Y., Kislyuk, D.: Large-scale reinforcement learning for diffusion models. arXiv preprint arXiv:2401.12244 (2024)","DOI":"10.1007\/978-3-031-73036-8_1"},{"key":"23_CR60","unstructured":"Ziegler, D.M., et al.: Fine-tuning language models from human preferences. arXiv preprint arXiv:1909.08593 (2019)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72661-3_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,26]],"date-time":"2024-11-26T08:23:28Z","timestamp":1732609408000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72661-3_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,27]]},"ISBN":["9783031726606","9783031726613"],"references-count":60,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72661-3_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,27]]},"assertion":[{"value":"27 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}