{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T05:53:53Z","timestamp":1763790833732,"version":"3.45.0"},"publisher-location":"Singapore","reference-count":43,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819533459","type":"print"},{"value":"9789819533466","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T00:00:00Z","timestamp":1763856000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,23]],"date-time":"2025-11-23T00:00:00Z","timestamp":1763856000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3346-6_3","type":"book-chapter","created":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T05:49:36Z","timestamp":1763790576000},"page":"34-47","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["StoryBench: A Dataset for\u00a0Diverse, Explainable, Multi-hop Narrative Text-to-Image Generation"],"prefix":"10.1007","author":[{"given":"Yuan","family":"Ge","sequence":"first","affiliation":[]},{"given":"Kaiyang","family":"Ye","sequence":"additional","affiliation":[]},{"given":"Saihan","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Aokai","family":"Hao","sequence":"additional","affiliation":[]},{"given":"Xiangnan","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Kaiyan","family":"Chang","sequence":"additional","affiliation":[]},{"given":"Tong","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Jingbo","family":"Zhu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,23]]},"reference":[{"key":"3_CR1","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., et\u00a0al.: High-resolution image synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on CVPR, pp. 10684\u201310695 (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"3_CR2","unstructured":"Betker, J., Goh, G., Jing, L., et\u00a0al.: Improving image generation with better captions. Comput. Sci. 2(3), 8 (2023). https:\/\/cdnopenai.com\/papers\/dall-e-3.pdf"},{"key":"3_CR3","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers. In: Proceedings of the IEEE\/CVF ICCV, pp. 4195\u20134205 (2023)","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"3_CR4","unstructured":"Polyak, A., Zohar, A., Brown, A., et\u00a0al.: Movie gen: a cast of media foundation models. arXiv preprint arXiv:2410.13720 (2024)"},{"key":"3_CR5","unstructured":"Kong, W., Tian, Q., Zhang, Z., et\u00a0al.: HunyuanVideo: a systematic framework for large video generative models. arXiv preprint arXiv:2412.03603 (2024)"},{"key":"3_CR6","unstructured":"Esser, P., Kulal, S., Blattmann, A., et\u00a0al.: Scaling rectified flow transformers for high-resolution image synthesis. In: Forty-First ICML (2024)"},{"key":"3_CR7","unstructured":"Runway: Introducing gen-3 alpha: A new frontier for video generation (2024). https:\/\/runwayml.com\/research\/introducing-gen-3-alpha"},{"key":"3_CR8","unstructured":"OpenAI: Video generation models as world simulators (2024). https:\/\/openai.com\/index\/video-generation-models-as-world-simulators\/"},{"key":"3_CR9","unstructured":"Danto, A.C.: The transfiguration of the Commonplace: A Philosophy of Art. Harvard University Press (1981)"},{"key":"3_CR10","doi-asserted-by":"crossref","unstructured":"Gombrich, E.H.: Art and illusion: a study in the psychology of pictorial representation-millennium edition (2023)","DOI":"10.2307\/jj.5425926"},{"key":"3_CR11","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., et al.: Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural. Inf. Process. Syst. 35, 36479\u201336494 (2022)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR12","doi-asserted-by":"crossref","unstructured":"Cho, J., Zala, A., Bansal, M.: DALL-Eval: probing the reasoning skills and social biases of text-to-image generation models. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 3043\u20133054 (2023)","DOI":"10.1109\/ICCV51070.2023.00283"},{"key":"3_CR13","first-page":"78723","volume":"36","author":"K Huang","year":"2023","unstructured":"Huang, K., Sun, K., Xie, E., et al.: T2I-CompBench: a comprehensive benchmark for open-world compositional text-to-image generation. Adv. Neural. Inf. Process. Syst. 36, 78723\u201378747 (2023)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"3_CR14","unstructured":"Wu, X., Yu, D., Huang, Y., et\u00a0al.: ConceptMix: a compositional image generation benchmark with controllable difficulty. arXiv preprint arXiv:2408.14339 (2024)"},{"key":"3_CR15","unstructured":"Yang, Y., Lin, Y., Liu, H., et\u00a0al.: Position: towards implicit prompt for text-to-image models. In: Forty-First International Conference on Machine Learning (2024)"},{"key":"3_CR16","unstructured":"Meng, F., Shao, W., Luo, L., et\u00a0al.: PhyBench: a physical commonsense benchmark for evaluating text-to-image models. arXiv preprint arXiv:2406.11802 (2024)"},{"key":"3_CR17","unstructured":"Lee, T., Yasunaga, M., Meng, C., et\u00a0al.: Holistic evaluation of text-to-image models. Adv. Neural Inf. Process. Syst. 36 (2024)"},{"key":"3_CR18","doi-asserted-by":"crossref","unstructured":"Li, B., Lin, Z., Pathak, D., et\u00a0al.: GenAI-bench: evaluating and improving compositional text-to-visual generation. arXiv preprint arXiv:2406.13743 (2024)","DOI":"10.1109\/CVPRW63382.2024.00538"},{"key":"3_CR19","doi-asserted-by":"crossref","unstructured":"Li, Y., Gan, Z., Shen, Y., et\u00a0al.: StoryGAN: a sequential conditional GAN for story visualization. In: Proceedings of the IEEE\/CVF Conference on CVPR, pp. 6329\u20136338 (2019)","DOI":"10.1109\/CVPR.2019.00649"},{"key":"3_CR20","doi-asserted-by":"publisher","unstructured":"Chen, H., Han, R., Wu, T.L., et\u00a0al.: Character-centric story visualization via visual planning and token alignment. In: Goldberg, Y., Kozareva, Z., Zhang, Y. (eds.) Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, Abu Dhabi, United Arab Emirates, pp. 8259\u20138272. Association for Computational Linguistics (2022). https:\/\/doi.org\/10.18653\/v1\/2022.emnlp-main.565. https:\/\/aclanthology.org\/2022.emnlp-main.565\/","DOI":"10.18653\/v1\/2022.emnlp-main.565"},{"key":"3_CR21","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"70","DOI":"10.1007\/978-3-031-19836-6_5","volume-title":"ECCV 2022","author":"A Maharana","year":"2022","unstructured":"Maharana, A., Hannan, D., Bansal, M.: STORYDALL-E: adapting pretrained text-to-image transformers for story continuation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13697, pp. 70\u201387. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19836-6_5"},{"key":"3_CR22","doi-asserted-by":"crossref","unstructured":"Rahman, T., Lee, H.Y., Ren, J., et\u00a0al.: Make-a-story: visual memory conditioned consistent story generation. In: Proceedings of the IEEE\/CVF Conference on CVPR, pp. 2493\u20132502 (2023)","DOI":"10.1109\/CVPR52729.2023.00246"},{"key":"3_CR23","doi-asserted-by":"crossref","unstructured":"Liu, C., Wu, H., Zhong, Y., et\u00a0al.: Intelligent grimm-open-ended visual storytelling via latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on CVPR, pp. 6190\u20136200 (2024)","DOI":"10.1109\/CVPR52733.2024.00592"},{"key":"3_CR24","doi-asserted-by":"crossref","unstructured":"Kou, Z., Pei, S., Zhang, X.: LeMon: automating portrait generation for zero-shot story visualization with multi-character interactions. In: Proceedings of the 30th ACM SIGKDD, pp. 1418\u20131427 (2024)","DOI":"10.1145\/3637528.3671850"},{"key":"3_CR25","doi-asserted-by":"crossref","unstructured":"Wu, J., Tang, C., Wang, J., et\u00a0al.: DiffSensei: bridging multi-modal LLMs and diffusion models for customized manga generation. arXiv preprint arXiv:2412.07589 (2024)","DOI":"10.1109\/CVPR52734.2025.02671"},{"key":"3_CR26","doi-asserted-by":"crossref","unstructured":"Kajic, I., Wiles, O., Albuquerque, I., et\u00a0al.: Evaluating numerical reasoning in text-to-image models. In: The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track (2024)","DOI":"10.52202\/079017-1335"},{"key":"3_CR27","unstructured":"Gokhale, T., Palangi, H., Nushi, B., et\u00a0al.: Benchmarking spatial relationships in text-to-image generation. arXiv preprint arXiv:2212.10015 (2022)"},{"key":"3_CR28","unstructured":"Cobbe, K., Kosaraju, V., Bavarian, M., et\u00a0al.: Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021)"},{"key":"3_CR29","unstructured":"Wang, X., Wei, J., Schuurmans, D., et\u00a0al.: Self-consistency improves chain of thought reasoning in language models. In: The Eleventh International Conference on Learning Representations, ICLR 2023. Kigali, Rwanda, 1\u20135 May 2023"},{"key":"3_CR30","unstructured":"Hu, C., Ge, Y., Ma, X., et\u00a0al.: RankPrompt: step-by-step comparisons make language models better reasoners. In: Calzolari, N., Kan, M.Y., Hoste, V., et\u00a0al. (eds.) Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pp. 13524\u201313536. ELRA and ICCL, Torino, Italia, May 2024. https:\/\/aclanthology.org\/2024.lrec-main.1183\/"},{"key":"3_CR31","unstructured":"Ye, J., Gong, S., Chen, L., et\u00a0al.: Diffusion of thought: chain-of-thought reasoning in diffusion language models. In: The Thirty-Eighth Annual Conference on Neural Information Processing Systems (2024)"},{"key":"3_CR32","doi-asserted-by":"publisher","unstructured":"Talmor, A., Herzig, J., Lourie, N., et\u00a0al.: CommonsenseQA: a question answering challenge targeting commonsense knowledge. In: Burstein, J., Doran, C., Solorio, T. (eds.) Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), Minneapolis, Minnesota, pp. 4149\u20134158. Association for Computational Linguistics (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1421. https:\/\/aclanthology.org\/N19-1421\/","DOI":"10.18653\/v1\/N19-1421"},{"key":"3_CR33","unstructured":"Chen, M., Tworek, J., Jun, H., et\u00a0al.: Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374 (2021)"},{"key":"3_CR34","unstructured":"Ye, J., Gong, S., Chen, L., et\u00a0al.: Diffusion of thoughts: chain-of-thought reasoning in diffusion language models. In: NeurIPS (2024)"},{"key":"3_CR35","doi-asserted-by":"crossref","unstructured":"Yang, Z., Qi, P., Zhang, S., et\u00a0al.: HotpotQA: a dataset for diverse, explainable multi-hop question answering. arXiv preprint arXiv:1809.09600 (2018)","DOI":"10.18653\/v1\/D18-1259"},{"key":"3_CR36","unstructured":"Podell, D., English, Z., Lacey, K., et\u00a0al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"3_CR37","unstructured":"Labs, B.F.: Flux (2024). https:\/\/github.com\/black-forest-labs\/flux"},{"key":"3_CR38","unstructured":"Midjourney: Midjourney (2024). https:\/\/www.midjourney.com\/"},{"key":"3_CR39","doi-asserted-by":"crossref","unstructured":"Horv\u00e1th, G.: Visual imagination and the narrative image. Parallelisms between art history and neuroscience. Cortex 105, 144\u2013154 (2018)","DOI":"10.1016\/j.cortex.2018.06.007"},{"key":"3_CR40","unstructured":"Chen, Z., Wang, W., Cao, Y., et\u00a0al.: Expanding performance boundaries of open-source multimodal models with model, data, and test-time scaling. arXiv preprint arXiv:2412.05271 (2024)"},{"key":"3_CR41","unstructured":"Wang, Y., Yu, Z., Yao, W., et\u00a0al.: PandaLM: an automatic evaluation benchmark for LLM instruction tuning optimization. In: The Twelfth International Conference on Learning Representations, ICLR (2023)"},{"key":"3_CR42","unstructured":"Kaplan, J., McCandlish, S., Henighan, T., et\u00a0al.: Scaling laws for neural language models. arXiv preprint arXiv:2001.08361 (2020)"},{"key":"3_CR43","first-page":"54872","volume":"37","author":"Q Chen","year":"2024","unstructured":"Chen, Q., Qin, L., Wang, J., et al.: Unlocking the capabilities of thought: a reasoning boundary framework to quantify and optimize chain-of-thought. Adv. Neural. Inf. Process. Syst. 37, 54872\u201354904 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3346-6_3","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,22]],"date-time":"2025-11-22T05:49:47Z","timestamp":1763790587000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3346-6_3"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,23]]},"ISBN":["9789819533459","9789819533466"],"references-count":43,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3346-6_3","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,23]]},"assertion":[{"value":"23 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Urumqi","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"9 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2025\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}