{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T12:07:24Z","timestamp":1743077244989,"version":"3.40.3"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031733895"},{"type":"electronic","value":"9783031733901"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73390-1_15","type":"book-chapter","created":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T16:24:01Z","timestamp":1730305441000},"page":"250-266","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Stable Preference: Redefining Training Paradigm of\u00a0Human Preference Model for\u00a0Text-to-Image Synthesis"],"prefix":"10.1007","author":[{"given":"Hanting","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hongjing","family":"Niu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Feng","family":"Zhao","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"15_CR1","unstructured":"Chen, X., Fang, H., Lin, T.Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., Zitnick, C.L.: Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"issue":"1","key":"15_CR2","doi-asserted-by":"publisher","first-page":"121","DOI":"10.3758\/BF03210784","volume":"4","author":"GJ Digirolamo","year":"1997","unstructured":"Digirolamo, G.J., Hintzman, D.L.: First impressions are lasting impressions: a primacy effect in memory for repetitions. Psychonomic Bull. Rev. 4(1), 121\u2013124 (1997)","journal-title":"Psychonomic Bull. Rev."},{"key":"15_CR3","first-page":"19822","volume":"34","author":"M Ding","year":"2021","unstructured":"Ding, M., et al.: Cogview: mastering text-to-image generation via transformers. Adv. Neural. Inf. Process. Syst. 34, 19822\u201319835 (2021)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR4","first-page":"16890","volume":"35","author":"M Ding","year":"2022","unstructured":"Ding, M., Zheng, W., Hong, W., Tang, J.: Cogview2: faster and better text-to-image generation via hierarchical transformers. Adv. Neural. Inf. Process. Syst. 35, 16890\u201316902 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR5","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., pp. 12873\u201312883 (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"15_CR6","doi-asserted-by":"crossref","unstructured":"Gu, S., et al.: Vector quantized diffusion model for text-to-image synthesis. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., pp. 10696\u201310706 (2022)","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"15_CR7","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: Gans trained by a two time-scale update rule converge to a local nash equilibrium. Adv. Neural Inf. Process. Syst. 30 (2017)"},{"issue":"3","key":"15_CR8","doi-asserted-by":"publisher","first-page":"1552","DOI":"10.1109\/TPAMI.2020.3021209","volume":"44","author":"T Hinz","year":"2020","unstructured":"Hinz, T., Heinrich, S., Wermter, S.: Semantic object accuracy for generative text-to-image synthesis. IEEE Trans. Pattern Anal. Mach. Intell. 44(3), 1552\u20131565 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"15_CR9","doi-asserted-by":"crossref","unstructured":"Hong, S., Yang, D., Choi, J., Lee, H.: Inferring semantic layout for hierarchical text-to-image synthesis. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., pp. 7986\u20137994 (2018)","DOI":"10.1109\/CVPR.2018.00833"},{"key":"15_CR10","doi-asserted-by":"publisher","unstructured":"Ilharco, G., et al.: Openclip, July 2021. https:\/\/doi.org\/10.5281\/zenodo.5143773, https:\/\/doi.org\/10.5281\/zenodo.5143773, if you use this software, please cite it as below","DOI":"10.5281\/zenodo.5143773"},{"key":"15_CR11","doi-asserted-by":"crossref","unstructured":"Judd, T., Ehinger, K., Durand, F., Torralba, A.: Learning to predict where humans look. In: Proc. IEEE Int. Conf. Comput. Vis., pp. 2106\u20132113. IEEE (2009)","DOI":"10.1109\/ICCV.2009.5459462"},{"key":"15_CR12","unstructured":"Kirstain, Y., Polyak, A., Singer, U., Matiana, S., Penna, J., Levy, O.: Pick-a-pic: an open dataset of user preferences for text-to-image generation. arXiv preprint arXiv:2305.01569 (2023)"},{"key":"15_CR13","unstructured":"Kynk\u00e4\u00e4nniemi, T., Karras, T., Laine, S., Lehtinen, J., Aila, T.: Improved precision and recall metric for assessing generative models. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"15_CR14","unstructured":"Li, J., Li, D., Xiong, C., Hoi, S.: Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: Proc. Int. Conf. Mach. Learn., pp. 12888\u201312900. PMLR (2022)"},{"key":"15_CR15","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Goyal, P., Girshick, R., He, K., Doll\u00e1r, P.: Focal loss for dense object detection. In: Proc. IEEE Int. Conf. Comput. Vis., pp. 2980\u20132988 (2017)","DOI":"10.1109\/ICCV.2017.324"},{"key":"15_CR16","unstructured":"Liu, X., Gong, C., Wu, L., Zhang, S., Su, H., Liu, Q.: Fusedream: training-free text-to-image generation with improved CLIP+GAN space optimization. arXiv preprint arXiv:2112.01573 (2021)"},{"key":"15_CR17","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"15_CR18","doi-asserted-by":"crossref","unstructured":"Murray, N., Marchesotti, L., Perronnin, F.: Ava: a large-scale database for aesthetic visual analysis. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., pp. 2408\u20132415. IEEE (2012)","DOI":"10.1109\/CVPR.2012.6247954"},{"key":"15_CR19","unstructured":"Naeem, M.F., Oh, S.J., Uh, Y., Choi, Y., Yoo, J.: Reliable fidelity and diversity metrics for generative models. In: Proc. Int. Conf. Mach. Learn., pp. 7176\u20137185. PMLR (2020)"},{"key":"15_CR20","unstructured":"Nichol, A., et al.: Glide: Towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"issue":"3","key":"15_CR21","first-page":"421","volume":"21","author":"SE Palmer","year":"2008","unstructured":"Palmer, S.E., et al.: Aesthetic issues in spatial composition: effects of position and direction on framing single objects. Spat. Vis. 21(3), 421\u2013450 (2008)","journal-title":"Spat. Vis."},{"key":"15_CR22","unstructured":"Park, D.H., Azadi, S., Liu, X., Darrell, T., Rohrbach, A.: Benchmark for compositional text-to-image synthesis. In: Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 1) (2021)"},{"key":"15_CR23","unstructured":"Paszke, A., et\u00a0al.: Pytorch: an imperative style, high-performance deep learning library. Adv. Neural Inf. Process. Syst. 32 (2019)"},{"key":"15_CR24","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: Proc. Int. Conf. Mach. Learn., pp. 8748\u20138763. PMLR (2021)"},{"key":"15_CR25","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"15_CR26","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: Proc. Int. Conf. Mach. Learn., pp. 8821\u20138831. PMLR (2021)"},{"key":"15_CR27","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"15_CR28","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR29","unstructured":"Sajjadi, M.S., Bachem, O., Lucic, M., Bousquet, O., Gelly, S.: Assessing generative models via precision and recall. Adv. Neural Inf. Process. Syst. 31 (2018)"},{"key":"15_CR30","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training gans. Adv. Neural Inf. Process. Syst. 29 (2016)"},{"key":"15_CR31","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., et al.: Laion-5b: an open large-scale dataset for training next generation image-text models. Adv. Neural. Inf. Process. Syst. 35, 25278\u201325294 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"15_CR32","doi-asserted-by":"crossref","unstructured":"Wang, Z.J., Montoya, E., Munechika, D., Yang, H., Hoover, B., Chau, D.H.: Diffusiondb: a large-scale prompt gallery dataset for text-to-image generative models. arXiv preprint arXiv:2210.14896 (2022)","DOI":"10.18653\/v1\/2023.acl-long.51"},{"key":"15_CR33","unstructured":"Wu, X., et al.: Human preference score v2: a solid benchmark for evaluating human preferences of text-to-image synthesis. arXiv preprint arXiv:2306.09341 (2023)"},{"key":"15_CR34","doi-asserted-by":"crossref","unstructured":"Wu, X., Sun, K., Zhu, F., Zhao, R., Li, H.: Human preference score: Better aligning text-to-image models with human preference. In: Proc. IEEE Int. Conf. Comput. Vis., pp. 2096\u20132105 (2023)","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"15_CR35","unstructured":"Xu, J., et al.: Imagereward: Learning and evaluating human preferences for text-to-image generation. arXiv preprint arXiv:2304.05977 (2023)"},{"key":"15_CR36","doi-asserted-by":"crossref","unstructured":"Xu, T., Zhang, P., Huang, Q., Zhang, H., Gan, Z., Huang, X., He, X.: Attngan: fine-grained text to image generation with attentional generative adversarial networks. In: Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit., pp. 1316\u20131324 (2018)","DOI":"10.1109\/CVPR.2018.00143"},{"key":"15_CR37","doi-asserted-by":"crossref","unstructured":"Xu, X., Wang, Z., Zhang, G., Wang, K., Shi, H.: Versatile diffusion: text, images and variations all in one diffusion model. In: Proc. IEEE Int. Conf. Comput. Vis., pp. 7754\u20137765 (2023)","DOI":"10.1109\/ICCV51070.2023.00713"},{"key":"15_CR38","unstructured":"Zhang, X., et al.: Gpt-4v (ision) as a generalist evaluator for vision-language tasks. arXiv preprint arXiv:2311.01361 (2023)"},{"key":"15_CR39","unstructured":"Zhou, Y., et al.: LAFITE: towards language-free training for text-to-image generation. arxiv 2021. arXiv preprint arXiv:2111.13792"},{"key":"15_CR40","unstructured":"Zhu, D., Chen, J., Shen, X., Li, X., Elhoseiny, M.: Minigpt-4: enhancing vision-language understanding with advanced large language models. arXiv preprint arXiv:2304.10592 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73390-1_15","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,30]],"date-time":"2024-10-30T16:32:16Z","timestamp":1730305936000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73390-1_15"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031733895","9783031733901"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73390-1_15","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}