{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:37:53Z","timestamp":1767339473918,"version":"3.40.3"},"publisher-location":"Cham","reference-count":43,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031726699"},{"type":"electronic","value":"9783031726705"}],"license":[{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,9,30]],"date-time":"2024-09-30T00:00:00Z","timestamp":1727654400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72670-5_24","type":"book-chapter","created":{"date-parts":[[2024,9,29]],"date-time":"2024-09-29T07:01:50Z","timestamp":1727593310000},"page":"422-438","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["The Fabrication of\u00a0Reality and\u00a0Fantasy: Scene Generation with\u00a0LLM-Assisted Prompt Interpretation"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8227-5662","authenticated-orcid":false,"given":"Yi","family":"Yao","sequence":"first","affiliation":[]},{"given":"Chan-Feng","family":"Hsu","sequence":"additional","affiliation":[]},{"given":"Jhe-Hao","family":"Lin","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5652-4327","authenticated-orcid":false,"given":"Hongxia","family":"Xie","sequence":"additional","affiliation":[]},{"given":"Terence","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Yi-Ning","family":"Huang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2216-077X","authenticated-orcid":false,"given":"Hong-Han","family":"Shuai","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4662-7875","authenticated-orcid":false,"given":"Wen-Huang","family":"Cheng","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,9,30]]},"reference":[{"key":"24_CR1","doi-asserted-by":"crossref","unstructured":"Anciukevi\u010dius, T., et al.: RenderDiffusion: image diffusion for 3d reconstruction, inpainting and generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12608\u201312618 (2023)","DOI":"10.1109\/CVPR52729.2023.01213"},{"key":"24_CR2","unstructured":"Bar-Tal, O., Yariv, L., Lipman, Y., Dekel, T.: MultiDiffusion: fusing diffusion paths for controlled image generation. In: International Conference on Machine Learning (2023)"},{"issue":"4","key":"24_CR3","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3592116","volume":"42","author":"H Chefer","year":"2023","unstructured":"Chefer, H., Alaluf, Y., Vinker, Y., Wolf, L., Cohen-Or, D.: Attend-and-excite: attention-based semantic guidance for text-to-image diffusion models. ACM Trans. Graph. (TOG) 42(4), 1\u201310 (2023)","journal-title":"ACM Trans. Graph. (TOG)"},{"key":"24_CR4","unstructured":"Feng, W., et al.: Training-free structured diffusion guidance for compositional text-to-image synthesis. In: International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=PUIqjT4rzq7"},{"key":"24_CR5","unstructured":"Feng, W., et al.: LayoutGPT: compositional visual planning and generation with large language models. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2024)"},{"key":"24_CR6","doi-asserted-by":"crossref","unstructured":"Friedrich, F., et al.: Fair diffusion: instructing text-to-image generation models on fairness. arXiv preprint arXiv:2302.10893 (2023)","DOI":"10.1007\/s43681-024-00531-5"},{"key":"24_CR7","unstructured":"Gani, H., Bhat, S.F., Naseer, M., Khan, S., Wonka, P.: LLM blueprint: enabling text-to-image generation with complex and detailed prompts. In: International Conference on Learning Representations (2024)"},{"key":"24_CR8","unstructured":"Golnari, P.A.: LORA-enhanced distillation on guided diffusion models. arXiv preprint arXiv:2312.06899 (2023)"},{"key":"24_CR9","doi-asserted-by":"crossref","unstructured":"Gong, J., Foo, L.G., Fan, Z., Ke, Q., Rahmani, H., Liu, J.: DiffPose: toward more reliable 3D pose estimation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 13041\u201313051 (2023)","DOI":"10.1109\/CVPR52729.2023.01253"},{"key":"24_CR10","doi-asserted-by":"crossref","unstructured":"Hessel, J., Holtzman, A., Forbes, M., Bras, R.L., Choi, Y.: CLIPScore: a reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718 (2021)","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"24_CR11","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. In: Advances in Neural Information Processing Systems Workshop (2021). https:\/\/openreview.net\/forum?id=qw8AKxfYbI"},{"key":"24_CR12","unstructured":"Hu, E.J., et al.: LORA: low-rank adaptation of large language models. In: International Conference on Learning Representations (2022)"},{"key":"24_CR13","unstructured":"Huang, K., Sun, K., Xie, E., Li, Z., Liu, X.: T2I-CompBench: a comprehensive benchmark for open-world compositional text-to-image generation. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2024)"},{"key":"24_CR14","doi-asserted-by":"crossref","unstructured":"Kemker, R., McClure, M., Abitino, A., Hayes, T., Kanan, C.: Measuring catastrophic forgetting in neural networks. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.11651"},{"key":"24_CR15","doi-asserted-by":"crossref","unstructured":"Qu, L., Wu, S., Fei, H., Nie, L., Chua, T.S.: LayoutLLM-T2I: eliciting layout guidance from LLM for text-to-image generation. In: Proceedings of the ACM International Conference on Multimedia (2023)","DOI":"10.1145\/3581783.3612012"},{"key":"24_CR16","unstructured":"Lian, L., Li, B., Yala, A., Darrell, T.: LLM-grounded diffusion: enhancing prompt understanding of text-to-image diffusion models with large language models. arXiv preprint arXiv:2305.13655 (2023)"},{"key":"24_CR17","unstructured":"Lian, L., Shi, B., Yala, A., Darrell, T., Li, B.: LLM-grounded video diffusion models. arXiv preprint arXiv:2309.17444 (2023)"},{"key":"24_CR18","unstructured":"Liu, H., Li, C., Wu, Q., Lee, Y.J.: Visual instruction tuning. In: Advances in Neural Information Processing Systems (2023)"},{"key":"24_CR19","unstructured":"Luccioni, S., Akiki, C., Mitchell, M., Jernite, Y.: Stable bias: evaluating societal representations in diffusion models. In: Advances in Neural Information Processing Systems, vol.\u00a036 (2024)"},{"key":"24_CR20","unstructured":"Mantri, K.S.I., Sasikumar, N.: Interactive fashion content generation using LLMs and latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition Workshop (2023)"},{"key":"24_CR21","doi-asserted-by":"publisher","unstructured":"Naik, R., Nushi, B.: Social biases through the text-to-image generation lens. In: Proceedings of the AAAI\/ACM Conference on AI, Ethics, and Society, pp. 786\u2013808. AIES 2023, Association for Computing Machinery, New York, NY, USA (2023). https:\/\/doi.org\/10.1145\/3600211.3604711","DOI":"10.1145\/3600211.3604711"},{"key":"24_CR22","doi-asserted-by":"crossref","unstructured":"Nair, N.G., et al.: Steered diffusion: a generalized framework for plug-and-play conditional image synthesis. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 20850\u201320860 (2023)","DOI":"10.1109\/ICCV51070.2023.01906"},{"key":"24_CR23","unstructured":"Nichol, A., et al.: GLIDE: towards photorealistic image generation and editing with text-guided diffusion models. In: Proceedings of Machine Learning Research, pp. 16784\u201316804 (2022)"},{"key":"24_CR24","doi-asserted-by":"publisher","unstructured":"Orgad, H., Kawar, B., Belinkov, Y.: Editing implicit assumptions in text-to-image diffusion models. In: 2023 IEEE\/CVF International Conference on Computer Vision, pp. 7030\u20137038. IEEE Computer Society, Los Alamitos, CA, USA (2023). https:\/\/doi.org\/10.1109\/ICCV51070.2023.00649","DOI":"10.1109\/ICCV51070.2023.00649"},{"key":"24_CR25","doi-asserted-by":"crossref","unstructured":"Perera, M.V., Patel, V.M.: Analyzing bias in diffusion-based face generation models. arXiv preprint arXiv:2305.06402 (2023)","DOI":"10.1109\/IJCB57857.2023.10449200"},{"key":"24_CR26","doi-asserted-by":"crossref","unstructured":"Phung, Q., Ge, S., Huang, J.B.: Grounded text-to-image synthesis with attention refocusing. arXiv preprint arXiv:2306.05427 (2023)","DOI":"10.1109\/CVPR52733.2024.00758"},{"key":"24_CR27","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. In: International Conference on Learning Representations (2024). https:\/\/openreview.net\/forum?id=di52zR8xgf"},{"key":"24_CR28","unstructured":"Qin, J., et al.: DiffusionGPT: LLM-driven text-to-image generation system. arXiv preprint arXiv:2401.10061 (2024)"},{"key":"24_CR29","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"24_CR30","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with CLIP latents. arXiv preprint arXiv:2204.061251(2), 3 (2022)"},{"key":"24_CR31","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"24_CR32","unstructured":"Saharia, C., et\u00a0al.: Photorealistic text-to-image diffusion models with deep language understanding. In: Advances in Neural Information Processing Systems, vol.\u00a035, pp. 36479\u201336494 (2022)"},{"key":"24_CR33","unstructured":"Smith, J.S., et al.: Continual diffusion: continual customization of text-to-image diffusion with C-LORA. arXiv preprint arXiv:2304.06027 (2023)"},{"key":"24_CR34","doi-asserted-by":"crossref","unstructured":"Su, X., et al.: Unbiased image synthesis via manifold-driven sampling in diffusion models. arXiv preprint arXiv:2307.08199 (2023)","DOI":"10.1109\/ICME57554.2024.10687809"},{"key":"24_CR35","unstructured":"Wei, J., et\u00a0al.: Chain-of-thought prompting elicits reasoning in large language models. In: Advances in Neural Information Processing Systems, vol.\u00a035, pp. 24824\u201324837 (2022)"},{"key":"24_CR36","doi-asserted-by":"crossref","unstructured":"Wu, T.H., Lian, L., Gonzalez, J.E., Li, B., Darrell, T.: Self-correcting LLM-controlled diffusion models. arXiv preprint arXiv:2311.16090 (2023)","DOI":"10.1109\/CVPR52733.2024.00605"},{"key":"24_CR37","doi-asserted-by":"crossref","unstructured":"Xie, J., et al.: BoxDiff: text-to-image synthesis with training-free box-constrained diffusion. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 7452\u20137461 (2023)","DOI":"10.1109\/ICCV51070.2023.00685"},{"key":"24_CR38","unstructured":"Yang, L., Yu, Z., Meng, C., Xu, M., Ermon, S., Cui, B.: Mastering text-to-image diffusion: recaptioning, planning, and generating with multimodal LLMS. arXiv preprint arXiv:2401.11708 (2024)"},{"key":"24_CR39","doi-asserted-by":"publisher","unstructured":"Yang, L., et al.: Diffusion models: a comprehensive survey of methods and applications. ACM Comput. Surv. 56(4) (2023). https:\/\/doi.org\/10.1145\/3626235","DOI":"10.1145\/3626235"},{"key":"24_CR40","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: ReCo: region-controlled text-to-image generation. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision and Pattern Recognition, pp. 14246\u201314255 (2023)","DOI":"10.1109\/CVPR52729.2023.01369"},{"key":"24_CR41","unstructured":"Zhang, C., Zhang, C., Zhang, M., Kweon, I.S.: Text-to-image diffusion model in generative AI: A survey. arXiv preprint arXiv:2303.07909 (2023)"},{"key":"24_CR42","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., Agrawala, M.: Adding conditional control to text-to-image diffusion models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3836\u20133847 (2023)","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"24_CR43","unstructured":"Zhang, T., Wang, Z., Huang, J., Tasnim, M.M., Shi, W.: A survey of diffusion based image generation models: issues and their solutions. arXiv preprint arXiv:2308.13142 (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72670-5_24","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,28]],"date-time":"2024-11-28T21:20:21Z","timestamp":1732828821000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72670-5_24"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,9,30]]},"ISBN":["9783031726699","9783031726705"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72670-5_24","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,9,30]]},"assertion":[{"value":"30 September 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}