{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T20:58:27Z","timestamp":1781297907677,"version":"3.54.1"},"reference-count":34,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,12,1]],"date-time":"2026-12-01T00:00:00Z","timestamp":1796083200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Displays"],"published-print":{"date-parts":[[2026,12]]},"DOI":"10.1016\/j.displa.2026.103566","type":"journal-article","created":{"date-parts":[[2026,6,9]],"date-time":"2026-06-09T23:43:07Z","timestamp":1781048587000},"page":"103566","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":0,"special_numbering":"C","title":["S2D-VidGen: Training-free video generation with frozen image diffusion models via coarse planning cues and consistent refinement"],"prefix":"10.1016","volume":"95","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-7045-3919","authenticated-orcid":false,"given":"Ziyu","family":"Chen","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9999-4871","authenticated-orcid":false,"given":"Hanli","family":"Wang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.displa.2026.103566_b1","unstructured":"J. Ho, T. Salimans, A. Gritsenko, W. Chan, M. Norouzi, D.J. Fleet, Video Diffusion Models, in: Proc. NeurIPS, 2022."},{"key":"10.1016\/j.displa.2026.103566_b2","series-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022"},{"key":"10.1016\/j.displa.2026.103566_b3","unstructured":"U. Singer, A. Polyak, T. Hayes, X. Yin, J. An, S. Zhang, Q. Hu, H. Yang, O. Ashual, O. Gafni, et al., Make-A-Video: Text-to-Video Generation without Text-Video Data, in: Proc. ICLR, 2023."},{"key":"10.1016\/j.displa.2026.103566_b4","series-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023"},{"key":"10.1016\/j.displa.2026.103566_b5","doi-asserted-by":"crossref","unstructured":"O. Bar-Tal, H. Chefer, O. Tov, C. Herrmann, R. Paiss, S. Zada, A. Ephrat, J. Hur, Y. Li, T. Michaeli, et al., Lumiere: A Space-Time Diffusion Model for Video Generation, in: Proc. SIGGRAPH Asia, 2024.","DOI":"10.1145\/3680528.3687614"},{"key":"10.1016\/j.displa.2026.103566_b6","doi-asserted-by":"crossref","unstructured":"L. Khachatryan, A. Movsisyan, V. Tadevosyan, R. Henschel, Z. Wang, S. Navasardyan, H. Shi, Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators, in: Proc. ICCV, 2023, pp. 15954\u201315964.","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"10.1016\/j.displa.2026.103566_b7","doi-asserted-by":"crossref","unstructured":"D. Jagpal, H. Raza, H. Rauf, M. Salehi, V.M. Patel, EIDT-V: Exploiting Intersections in Diffusion Trajectories for Model-Agnostic, Zero-Shot, Training-Free Text-to-Video Generation, in: Proc. CVPR, 2025.","DOI":"10.1109\/CVPR52734.2025.01698"},{"key":"10.1016\/j.displa.2026.103566_b8","series-title":"DirecT2V: Large language models are frame-level directors for zero-shot text-to-video generation","author":"Hong","year":"2023"},{"key":"10.1016\/j.displa.2026.103566_b9","series-title":"Free-bloom: Zero-shot text-to-video generator with LLM director and LDM animator","author":"Huang","year":"2023"},{"key":"10.1016\/j.displa.2026.103566_b10","doi-asserted-by":"crossref","unstructured":"Y. Zhu, Y. Feng, L. Wang, X. Ji, Y. Gao, Z. Wang, C. Ma, J. Zhou, Training-free Geometric Image Editing on Diffusion Models, in: Proc. ICCV, 2025.","DOI":"10.1109\/ICCV51701.2025.01778"},{"key":"10.1016\/j.displa.2026.103566_b11","doi-asserted-by":"crossref","unstructured":"Y. Yang, Z. Cao, C. Bai, Z. Guo, Z. Liu, C.C. Loy, VLIPP: Towards Physically Plausible Video Generation with Vision and Language Informed Physical Prior, in: Proc. ICCV, 2025.","DOI":"10.1109\/ICCV51701.2025.01149"},{"key":"10.1016\/j.displa.2026.103566_b12","series-title":"LaVie: High-quality video generation with cascaded latent diffusion models","author":"Wang","year":"2024"},{"key":"10.1016\/j.displa.2026.103566_b13","unstructured":"J.Z. Wu, Y. Ge, X. Wang, S.W. Lei, Y. Gu, Y. Shi, W. Hsu, Y. Shan, X. Qie, M.Z. Shou, Tune-A-Video: One-Shot Tuning of Image Diffusion Models for Text-to-Video Generation, in: Proc. ICCV, 2023, pp. 7623\u20137633."},{"key":"10.1016\/j.displa.2026.103566_b14","unstructured":"Y. Guo, C. Yang, A. Rao, Z. Liang, Y. Wang, Y. Qiao, M. Agrawala, D. Lin, B. Dai, AnimateDiff: Animate Your Personalized Text-to-Image Diffusion Models without Specific Tuning, in: Proc. ICLR, 2024."},{"key":"10.1016\/j.displa.2026.103566_b15","unstructured":"H. Qiu, M. Xia, Y. Zhang, Y. Liu, X. He, Y. Shan, Z. Liu, FreeNoise: Tuning-Free Longer Video Diffusion via Noise Rescheduling, in: Proc. CVPR, 2024."},{"key":"10.1016\/j.displa.2026.103566_b16","doi-asserted-by":"crossref","unstructured":"L. Zhang, A. Rao, M. Agrawala, Adding Conditional Control to Text-to-Image Diffusion Models, in: Proc. ICCV, 2023, pp. 3836\u20133847.","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"10.1016\/j.displa.2026.103566_b17","series-title":"IP-Adapter: Text compatible image prompt adapter for text-to-image diffusion models","author":"Ye","year":"2023"},{"key":"10.1016\/j.displa.2026.103566_b18","unstructured":"C. Meng, Y. He, Y. Song, J. Song, J. Wu, J.-Y. Zhu, S. Ermon, SDEdit: Guided Image Synthesis and Editing with Stochastic Differential Equations, in: Proc. ICLR, 2022."},{"key":"10.1016\/j.displa.2026.103566_b19","unstructured":"A. Hertz, R. Mokady, J. Tenenbaum, K. Aberman, Y. Pritch, D. Cohen-Or, Prompt-to-Prompt Image Editing with Cross Attention Control, in: Proc. ICLR, 2023."},{"key":"10.1016\/j.displa.2026.103566_b20","doi-asserted-by":"crossref","unstructured":"N. Tumanyan, M. Geyer, S. Bagon, T. Dekel, Plug-and-Play Diffusion Features for Text-Driven Image-to-Image Translation, in: Proc. CVPR, 2023, pp. 1921\u20131930.","DOI":"10.1109\/CVPR52729.2023.00191"},{"key":"10.1016\/j.displa.2026.103566_b21","series-title":"The Llama 3 herd of models","author":"Grattafiori","year":"2024"},{"key":"10.1016\/j.displa.2026.103566_b22","doi-asserted-by":"crossref","unstructured":"S. Liu, Z. Zeng, T. Ren, F. Li, H. Zhang, J. Yang, C. Jiang, Q. Li, J. Yang, H. Su, et al., Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection, in: Proc. ECCV, 2024.","DOI":"10.1007\/978-3-031-72970-6_3"},{"key":"10.1016\/j.displa.2026.103566_b23","doi-asserted-by":"crossref","unstructured":"A. Kirillov, E. Mintun, N. Ravi, H. Mao, C. Rolland, L. Gustafson, T. Xiao, S. Whitehead, A.C. Berg, W.-Y. Lo, et al., Segment Anything, in: Proc. ICCV, 2023, pp. 4015\u20134026.","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"10.1016\/j.displa.2026.103566_b24","unstructured":"J. Song, C. Meng, S. Ermon, Denoising Diffusion Implicit Models, in: Proc. ICLR, 2021."},{"key":"10.1016\/j.displa.2026.103566_b25","series-title":"Towards accurate generative models of video: A new metric & challenges","author":"Unterthiner","year":"2019"},{"key":"10.1016\/j.displa.2026.103566_b26","series-title":"Wan: Open and advanced large-scale video generative models","author":"Wan","year":"2025"},{"key":"10.1016\/j.displa.2026.103566_b27","series-title":"UCF101: A dataset of 101 human actions classes from videos in the wild","author":"Soomro","year":"2012"},{"key":"10.1016\/j.displa.2026.103566_b28","doi-asserted-by":"crossref","unstructured":"W. Xiong, W. Luo, L. Ma, W. Liu, J. Luo, Learning to Generate Time-Lapse Videos Using Multi-Stage Dynamic Generative Adversarial Networks, in: Proc. CVPR, 2018.","DOI":"10.1109\/CVPR.2018.00251"},{"key":"10.1016\/j.displa.2026.103566_b29","series-title":"The kinetics human action video dataset","author":"Kay","year":"2017"},{"key":"10.1016\/j.displa.2026.103566_b30","series-title":"FaceForensics: A large-scale video dataset for forgery detection in human faces","author":"R\u00f6ssler","year":"2018"},{"key":"10.1016\/j.displa.2026.103566_b31","doi-asserted-by":"crossref","unstructured":"S. Ge, A. Mahapatra, G. Parmar, J.-Y. Zhu, J.-B. Huang, On the Content Bias in Fr\u00e9chet Video Distance, in: Proc. CVPR, 2024.","DOI":"10.1109\/CVPR52733.2024.00695"},{"key":"10.1016\/j.displa.2026.103566_b32","doi-asserted-by":"crossref","unstructured":"Z. Wang, E.P. Simoncelli, A.C. Bovik, Multiscale Structural Similarity for Image Quality Assessment, in: Proc. Asilomar Conf. Signals Syst. Comput., 2003, pp. 1398\u20131402.","DOI":"10.1109\/ACSSC.2003.1292216"},{"key":"10.1016\/j.displa.2026.103566_b33","doi-asserted-by":"crossref","unstructured":"R. Zhang, P. Isola, A.A. Efros, E. Shechtman, O. Wang, The Unreasonable Effectiveness of Deep Features as a Perceptual Metric, in: Proc. CVPR, 2018.","DOI":"10.1109\/CVPR.2018.00068"},{"key":"10.1016\/j.displa.2026.103566_b34","doi-asserted-by":"crossref","unstructured":"H. Chen, Y. Zhang, X. Cun, M. Xia, X. Wang, C. Weng, Y. Shan, VideoCrafter2: Overcoming Data Limitations for High-Quality Video Diffusion Models, in: Proc. CVPR, 2024.","DOI":"10.1109\/CVPR52733.2024.00698"}],"container-title":["Displays"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938226002295?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0141938226002295?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,12]],"date-time":"2026-06-12T20:14:46Z","timestamp":1781295286000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0141938226002295"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,12]]},"references-count":34,"alternative-id":["S0141938226002295"],"URL":"https:\/\/doi.org\/10.1016\/j.displa.2026.103566","relation":{},"ISSN":["0141-9382"],"issn-type":[{"value":"0141-9382","type":"print"}],"subject":[],"published":{"date-parts":[[2026,12]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"S2D-VidGen: Training-free video generation with frozen image diffusion models via coarse planning cues and consistent refinement","name":"articletitle","label":"Article Title"},{"value":"Displays","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.displa.2026.103566","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier B.V. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"103566"}}