{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:39:01Z","timestamp":1778081941635,"version":"3.51.4"},"reference-count":89,"publisher":"Springer Science and Business Media LLC","issue":"5","license":[{"start":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T00:00:00Z","timestamp":1734912000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T00:00:00Z","timestamp":1734912000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"National Key R&D Program China","award":["2022ZD0160102"],"award-info":[{"award-number":["2022ZD0160102"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62102150"],"award-info":[{"award-number":["62102150"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003399","name":"Science and Technology Commission of Shanghai Municipality","doi-asserted-by":"publisher","award":["23QD1400800"],"award-info":[{"award-number":["23QD1400800"]}],"id":[{"id":"10.13039\/501100003399","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003399","name":"Science and Technology Commission of Shanghai Municipality","doi-asserted-by":"publisher","award":["23YF1461900"],"award-info":[{"award-number":["23YF1461900"]}],"id":[{"id":"10.13039\/501100003399","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1007\/s11263-024-02295-1","type":"journal-article","created":{"date-parts":[[2024,12,23]],"date-time":"2024-12-23T07:39:17Z","timestamp":1734939557000},"page":"3059-3078","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":123,"title":["LaVie: High-Quality Video Generation with Cascaded Latent Diffusion Models"],"prefix":"10.1007","volume":"133","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-9487-6187","authenticated-orcid":false,"given":"Yaohui","family":"Wang","sequence":"first","affiliation":[]},{"given":"Xinyuan","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Xin","family":"Ma","sequence":"additional","affiliation":[]},{"given":"Shangchen","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Ziqi","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Yi","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Ceyuan","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Yinan","family":"He","sequence":"additional","affiliation":[]},{"given":"Jiashuo","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Peiqing","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Yuwei","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Tianxing","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Chenyang","family":"Si","sequence":"additional","affiliation":[]},{"given":"Yuming","family":"Jiang","sequence":"additional","affiliation":[]},{"given":"Cunjian","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Chen 
Change","family":"Loy","sequence":"additional","affiliation":[]},{"given":"Bo","family":"Dai","sequence":"additional","affiliation":[]},{"given":"Dahua","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Ziwei","family":"Liu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,12,23]]},"reference":[{"key":"2295_CR1","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G.. & Zisserman, A. (2021). Frozen in time: A joint video and image encoder for end-to-end retrieval. In Proceedings of the IEEE\/CVF international conference on computer vision","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"2295_CR2","unstructured":"Balaji, Y., Nah, S., Huang, X., Vahdat, A., Song, J., Kreis, K., Aittala, M., Aila, T., Laine, S., Catanzaro, B., et al. (2022). ediffi: Text-to-image diffusion models with an ensemble of expert denoisers. arXiv preprint arXiv:2211.01324"},{"key":"2295_CR3","doi-asserted-by":"crossref","unstructured":"Bengio, Y., Louradour, J., Collobert, R., & Weston, J. (2009). Curriculum learning. In Proceedings of the 26th annual international conference on machine learning","DOI":"10.1145\/1553374.1553380"},{"key":"2295_CR4","doi-asserted-by":"crossref","unstructured":"Bhagat, S., Uppal, S., Yin, Z. & Lim, N. (2020). Disentangling multiple features in video sequences using gaussian processes in variational autoencoders. In Computer vision\u2013ECCV 2020: 16th European conference, Glasgow, UK, August 23\u201328, 2020, Proceedings, Part XXIII 16","DOI":"10.1007\/978-3-030-58592-1_7"},{"key":"2295_CR5","unstructured":"Blattmann, A., Dockhorn, T., Kulal, S., Mendelevitch, D., Kilian, M., Lorenz, D., Levi, Y., English, Z., Voleti, V., & Letts, A., et al. (2023). Stable video diffusion: Scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127"},{"key":"2295_CR6","doi-asserted-by":"crossref","unstructured":"Blattmann, A., Rombach, R., Ling, H., Dockhorn, T., Kim, S.W., Fidler, S., & Kreis, K. (2023). Align your latents: High-resolution video synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"2295_CR7","unstructured":"Brock, A., Donahue, J. & Simonyan, K. (2019). Large scale GAN training for high fidelity natural image synthesis. In ICLR"},{"key":"2295_CR8","unstructured":"Brooks, T., Peebles, B., Holmes, C., DePue, W., Guo, Y., Jing, L., Schnurr, D., Taylor, J., Luhman, T., & Luhman, E., et al. (2024). Video generation models as world simulators"},{"key":"2295_CR9","first-page":"31769","volume":"35","author":"T Brooks","year":"2022","unstructured":"Brooks, T., Hellsten, J., Aittala, M., Wang, T.-C., Aila, T., Lehtinen, J., Liu, M.-Y., Efros, A. A., & Karras, T. (2022). Generating long videos of dynamic scenes. Advances in Neural Information Processing Systems, 35, 31769\u201331781.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2295_CR10","doi-asserted-by":"crossref","unstructured":"Carreira, J., & Zisserman, A. (2017). Quo vadis, action recognition? a new model and the kinetics dataset. In Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2017.502"},{"key":"2295_CR11","doi-asserted-by":"crossref","unstructured":"Chan, K.C.K., Zhou, S., Xu, X. & Loy, C.C. (2022). 
Investigating tradeoffs in real-world video super-resolution. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR52688.2022.00587"},{"key":"2295_CR12","doi-asserted-by":"crossref","unstructured":"Chan, K.C.K., Zhou, S., Xu, X., & Loy, C.C. (2022). BasicVSR++: Improving video super-resolution with enhanced propagation and alignment. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR52688.2022.00588"},{"key":"2295_CR13","doi-asserted-by":"crossref","unstructured":"Chen, H., Li, J., Frintrop, S., & Hu, X. (2021). The msr-video to text dataset with clean annotations. arXiv preprint arXiv:2102.06448","DOI":"10.1016\/j.cviu.2022.103581"},{"key":"2295_CR14","doi-asserted-by":"crossref","unstructured":"Chen, J., YU, J., GE, C., Yao, L., Xie, E., Wang, Z., Kwok, J., Luo, P., Lu, H., & Li, Z. (2024). Pixart-$$\\alpha $$: Fast training of diffusion transformer for photorealistic text-to-image synthesis. In ICLR","DOI":"10.1007\/978-3-031-73411-3_5"},{"key":"2295_CR15","doi-asserted-by":"crossref","unstructured":"Chen, H., Zhang, Y., Cun, X., Xia, M., Wang, X., Weng, C., & Shan, Y. (2024). Videocrafter2: Overcoming data limitations for high-quality video diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp. 7310\u20137320.","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"2295_CR16","doi-asserted-by":"publisher","first-page":"7090","DOI":"10.1109\/TIP.2020.2998297","volume":"29","author":"X Chen","year":"2020","unstructured":"Chen, X., Xu, C., Yang, X., & Tao, D. (2020). Long-term video prediction via criticization and retrospection. IEEE Transactions on Image Processing, 29, 7090\u20137103.","journal-title":"IEEE Transactions on Image Processing"},{"key":"2295_CR17","unstructured":"Clark, A., Donahue, J., & Simonyan, K. (2019). Adversarial video generation on complex datasets. arXiv preprint arXiv:1907.06571"},{"key":"2295_CR18","unstructured":"Dai, X., Hou, J., Ma, C.-Y., Tsai, S., Wang, J., Wang, R., Zhang, P., Vandenhende, S., Wang, X., & Dubey, A., et al. (2023). Emu: Enhancing image generation models using photogenic needles in a haystack. arXiv preprint arXiv:2309.15807"},{"key":"2295_CR19","unstructured":"Esser, P., Kulal, S., Blattmann, A., Entezari, R., M\u00fcller, J., Saini, H., Levi, Y., Lorenz, D., Sauer, A., Boesel, F. & et al. (2024). Scaling rectified flow transformers for high-resolution image synthesis. In Forty-first international conference on machine learning"},{"key":"2295_CR20","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R. & Ommer, B. (2021). Taming transformers for high-resolution image synthesis. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"2295_CR21","doi-asserted-by":"crossref","unstructured":"Ge, S., Hayes, T., Yang, H., Yin, X., Pang, G., Jacobs, D., Huang, J.-B., & Parikh, D. (2022). Long video generation with time-agnostic vqgan and time-sensitive transformer. In European conference on computer vision","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"2295_CR22","doi-asserted-by":"crossref","unstructured":"Ge, S., Nah, S., Liu, G., Poon, T., Tao, A., Catanzaro, B., Jacobs, D., Huang, J.-B., Liu, M.-Y., & Balaji, Y. (2023). Preserve your own correlation: A noise prior for video diffusion models. 
arXiv preprint arXiv:2305.10474","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"2295_CR23","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014). Generative adversarial nets. Advances in Neural Information Processing Systems"},{"key":"2295_CR24","unstructured":"Guo, Y., Yang, C., Rao, A., Wang, Y., Qiao, Y., Lin, D., & Dai, B. (2023). Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725"},{"key":"2295_CR25","doi-asserted-by":"crossref","unstructured":"Gupta, A., Yu, L., Sohn, K., Gu, X., Hahn, M., Fei-Fei, L., Essa, I., Jiang, L., & Lezama, J. (2023). Photorealistic video generation with diffusion models. arXiv preprint arXiv:2312.06662","DOI":"10.1007\/978-3-031-72986-7_23"},{"key":"2295_CR26","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y., & Chen, Q. (2022). Latent video diffusion models for high-fidelity video generation with arbitrary lengths. arXiv preprint arXiv:2211.13221"},{"key":"2295_CR27","unstructured":"Ho, J., Chan, W., Saharia, C., Whang, J., Gao, R., Gritsenko, A., Kingma, D.P., Poole, B., Norouzi, M., & Fleet, D.J., et al. (2022). Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303"},{"key":"2295_CR28","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., & Fleet, D.J. (2022). Video diffusion models. arXiv preprint arXiv:2204.03458"},{"key":"2295_CR29","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. NeurIPS, 33, 6840.","journal-title":"NeurIPS"},{"key":"2295_CR30","unstructured":"Hong, W., Ding, M., Zheng, W., Liu, X., & Tang, J. (2023). Cogvideo: Large-scale pretraining for text-to-video generation via transformers. In ICLR"},{"key":"2295_CR31","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., & Chen, W. (2022). LoRA: Low-rank adaptation of large language models. In ICLR"},{"key":"2295_CR32","doi-asserted-by":"crossref","unstructured":"Huang, Z., Chan, K.C.K., Jiang, Y., & Liu, Z. (2023). Collaborative diffusion for multi-modal face generation and editing. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR52729.2023.00589"},{"key":"2295_CR33","doi-asserted-by":"crossref","unstructured":"Huang, Z., He, Y., Yu, J., Zhang, F., Si, C., Jiang, Y., Zhang, Y., Wu, T., Jin, Q., & Chanpaisit, N., et al. (2023). Vbench: Comprehensive benchmark suite for video generative models. arXiv preprint arXiv:2311.17982","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"2295_CR34","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Chan, K.C., Wang, X., Loy, C.C., & Liu, Z. (2021). Robust reference-based super-resolution via c2-matching. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR46437.2021.00214"},{"key":"2295_CR35","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Chan, K.C., Wang, X., Loy, C.C., & Liu, Z. (2022). Reference-based image and video super-resolution via $$c^{2}$$-matching. In IEEE transactions on pattern analysis and machine intelligence","DOI":"10.1109\/TPAMI.2022.3231089"},{"key":"2295_CR36","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Yang, S., Koh, T.L., Wu, W., Loy, C.C., & Liu, Z. (2023). Text2performer: Text-driven human video generation. 
arXiv preprint arXiv:2303.13495","DOI":"10.1109\/ICCV51070.2023.02079"},{"key":"2295_CR37","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S. & Aila, T. (2019). A style-based generator architecture for generative adversarial networks. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2019.00453"},{"key":"2295_CR38","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aittala, M., Hellsten, J., Lehtinen, J., & Aila, T. (2020). Analyzing and improving the image quality of StyleGAN. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR42600.2020.00813"},{"key":"2295_CR39","unstructured":"Kingma, D.P., & Welling, M. (2014). Auto-encoding variational bayes. In ICLR"},{"key":"2295_CR40","unstructured":"Li, Y., & Mandt, S. (2018). Disentangled sequential autoencoder. In ICML"},{"key":"2295_CR41","unstructured":"Li, K., He, Y., Wang, Y., Li, Y., Wang, W., Luo, P., Wang, Y., Wang, L., & Qiao, Y. (2023). VideoChat: Chat-Centric Video Understanding"},{"key":"2295_CR42","unstructured":"Lu, H., Yang, G., Fei, N., Huo, Y., Lu, Z., Luo, P. & Ding, M. (2023). Vdt: General-purpose video diffusion transformers via mask modeling. In ICLR"},{"key":"2295_CR43","doi-asserted-by":"crossref","unstructured":"Luo, Z., Chen, D., Zhang, Y., Huang, Y., Wang, L., Shen, Y., Zhao, D., Zhou, J., & Tan, T.-P. (2023). Videofusion: Decomposed diffusion models for high-quality video generation. In CVPR","DOI":"10.1109\/CVPR52729.2023.10308948"},{"key":"2295_CR44","unstructured":"Ma, X., Wang, Y., Jia, G., Chen, X., Liu, Z., Li, Y.-F., Chen, C., & Qiao, Y. (2024). Latte: Latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048"},{"key":"2295_CR45","doi-asserted-by":"crossref","unstructured":"Mokady, R., Hertz, A., Aberman, K., Pritch, Y., & Cohen-Or, D. (2022). Null-text inversion for editing real images using guided diffusion models. arXiv preprint arXiv:2211.09794","DOI":"10.1109\/CVPR52729.2023.00585"},{"key":"2295_CR46","unstructured":"Nichol, A.Q., & Dhariwal, P. (2021). Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning"},{"key":"2295_CR47","doi-asserted-by":"crossref","unstructured":"Parmar, G., Kumar\u00a0Singh, K., Zhang, R., Li, Y., Lu, J., & Zhu, J.-Y. (2023). Zero-shot image-to-image translation. In ACM SIGGRAPH 2023 conference proceedings","DOI":"10.1145\/3588432.3591513"},{"key":"2295_CR48","doi-asserted-by":"crossref","unstructured":"Peebles, W., & Xie, S. (2023). Scalable diffusion models with transformers. In Proceedings of the IEEE\/CVF international conference on computer vision","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"2295_CR49","unstructured":"Radford, A., Kim, J.W., Hallacy, C., Ramesh, A., Goh, G., Agarwal, S., Sastry, G., Askell, A., Mishkin, P., Clark, J., et al. (2021). Learning transferable visual models from natural language supervision. In International conference on machine learning"},{"key":"2295_CR50","unstructured":"Radford, A., Metz, L. & Chintala, S. (2015). Unsupervised representation learning with deep convolutional generative adversarial networks. arXiv preprint arXiv:1511.06434"},{"issue":"140","key":"2295_CR51","first-page":"1","volume":"21","author":"C Raffel","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., & Liu, P. J. (2020). 
Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, 21(140), 1\u201367.","journal-title":"Journal of Machine Learning Research"},{"key":"2295_CR52","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., & Chen, M. (2022). Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125"},{"key":"2295_CR53","unstructured":"Ramesh, A., Pavlov, M., Goh, G., Gray, S., Voss, C., Radford, A., Chen, M., & Sutskever, I. (2021). Zero-shot text-to-image generation. In International conference on machine learning"},{"key":"2295_CR54","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., & Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2295_CR55","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Chan, W., Saxena, S., Li, L., Whang, J., Denton, E. L., Ghasemipour, K., Gontijo Lopes, R., Karagol Ayan, B., Salimans, T., et al. (2022). Photorealistic text-to-image diffusion models with deep language understanding. Advances in Neural Information Processing Systems, 35, 36479\u201336494.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2295_CR56","first-page":"4713","volume":"45","author":"C Saharia","year":"2022","unstructured":"Saharia, C., Ho, J., Chan, W., Salimans, T., Fleet, D. J., & Norouzi, M. (2022). Image super-resolution via iterative refinement. IEEE Transactions on Pattern Analysis and Machine Intelligence, 45, 4713.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"2295_CR57","doi-asserted-by":"crossref","unstructured":"Saito, M., Matsumoto, E., & Saito, S. (2017). Temporal generative adversarial nets with singular value clipping. In Proceedings of the IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2017.308"},{"key":"2295_CR58","first-page":"25278","volume":"35","author":"C Schuhmann","year":"2022","unstructured":"Schuhmann, C., Beaumont, R., Vencu, R., Gordon, C., Wightman, R., Cherti, M., Coombes, T., Katta, A., Mullis, C., Wortsman, M., et al. (2022). Laion-5b: An open large-scale dataset for training next generation image-text models. Advances in Neural Information Processing Systems, 35, 25278\u201325294.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"2295_CR59","unstructured":"Singer, U., Polyak, A., Hayes, T., Yin, X., An, J., Zhang, S., Hu, Q., Yang, H., Ashual, O., Gafni, O., Parikh, D., Gupta, S., & Taigman, Y. (2023). Make-a-video: Text-to-video generation without text-video data. In ICLR"},{"key":"2295_CR60","doi-asserted-by":"crossref","unstructured":"Skorokhodov, I., Tulyakov, S., & Elhoseiny, M. (2022). Stylegan-v: A continuous video generator with the price, image quality and perks of stylegan2. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR52688.2022.00361"},{"key":"2295_CR61","unstructured":"Song, J., Meng, C., & Ermon, S. (2021). Denoising diffusion implicit models. In ICLR"},{"key":"2295_CR62","unstructured":"Song, Y., Sohl-Dickstein, J., Kingma, D.P., Kumar, A., Ermon, S., & Poole, B. (2021). Score-based generative modeling through stochastic differential equations. 
In ICLR"},{"key":"2295_CR63","unstructured":"Soomro, K., Zamir, A.R., & Shah, M. (2012). Ucf101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402"},{"key":"2295_CR64","unstructured":"Su, J., Lu, Y., Pan, S., Murtadha, A., Wen, B., & Liu, Y. (2021). Roformer: Enhanced transformer with rotary position embedding. arXiv preprint arXiv:2104.09864"},{"key":"2295_CR65","unstructured":"Tian, Y., Ren, J., Chai, M., Olszewski, K., Peng, X., Metaxas, D.N., & Tulyakov, S. (2021). A good image generator is what you need for high-resolution video synthesis. In ICLR"},{"key":"2295_CR66","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., & Azhar, F., et al. (2023). Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971"},{"key":"2295_CR67","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.-Y., Yang, X., & Kautz, J. (2018). MoCoGAN: Decomposing motion and content for video generation. In Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2018.00165"},{"key":"2295_CR68","unstructured":"Van Den Oord, A., Vinyals, O., & Kavukcuoglu, K. (2017). Neural discrete representation learning. Advances in neural information processing systems."},{"key":"2295_CR69","unstructured":"Vondrick, C., Pirsiavash, H., & Torralba, A. (2016). Generating videos with scene dynamics. In NeurIPS"},{"key":"2295_CR70","unstructured":"Wang, Y. (2021). Learning to Generate Human Videos. Theses: Inria\u2014Sophia Antipolis; Universit\u00e9 Cote d\u2019Azur"},{"key":"2295_CR71","doi-asserted-by":"crossref","unstructured":"Wang, Y., Bilinski, P., Bremond, F. & Dantcheva, A. (2020). Imaginator: Conditional spatio-temporal gan for video generation. In Proceedings of the IEEE\/CVF winter conference on applications of computer vision","DOI":"10.1109\/WACV45572.2020.9093492"},{"key":"2295_CR72","doi-asserted-by":"crossref","unstructured":"Wang, Y., Bilinski, P., Bremond, F., & Dantcheva, A. (2020). G3AN: Disentangling appearance and motion for video generation. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR42600.2020.00531"},{"key":"2295_CR73","unstructured":"Wang, Y., Bremond, F., & Dantcheva, A. (2021). Inmodegan: Interpretable motion decomposition generative adversarial network for video generation. arXiv preprint arXiv:2101.03049"},{"key":"2295_CR74","doi-asserted-by":"crossref","unstructured":"Wang, Y., Ma, X., Chen, X., Dantcheva, A., Dai, B., & Qiao, Y. (2023). Leo: Generative latent image animator for human video synthesis. arXiv preprint arXiv:2305.03989","DOI":"10.1007\/s11263-024-02231-3"},{"key":"2295_CR75","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X., & Zhang, S. (2023). Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571"},{"key":"2295_CR76","doi-asserted-by":"crossref","unstructured":"Wang, J., Yue, Z., Zhou, S., Chan, K.C., & Loy, C.C. (2023). Exploiting diffusion prior for real-world image super-resolution. arXiv preprint arXiv:2305.07015","DOI":"10.1007\/s11263-024-02168-7"},{"key":"2295_CR77","unstructured":"Wu, C., Huang, L., Zhang, Q., Li, B., Ji, L., Yang, F., Sapiro, G., & Duan, N. (2021). Godiva: Generating open-domain videos from natural descriptions. 
arXiv preprint arXiv:2104.14806"},{"key":"2295_CR78","doi-asserted-by":"crossref","unstructured":"Wu, C., Liang, J., Ji, L., Yang, F., Fang, Y., Jiang, D., & Duan, N. (2022). N\u00fcwa: Visual synthesis pre-training for neural visual world creation. In European conference on computer vision","DOI":"10.1007\/978-3-031-19787-1_41"},{"key":"2295_CR79","doi-asserted-by":"crossref","unstructured":"Xie, J., Gao, R., Zheng, Z., Zhu, S.-C. & Wu, Y.N. (2020). Motion-based generator model: Unsupervised disentanglement of appearance, trackable and intrackable motions in dynamic patterns. In Proceedings of the AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v34i07.6931"},{"key":"2295_CR80","unstructured":"Yan, W., Zhang, Y., Abbeel, P., & Srinivas, A. (2021). Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:2104.10157"},{"key":"2295_CR81","unstructured":"Yang, Z., Teng, J., Zheng, W., Ding, M., Huang, S., Xu, J., Yang, Y., Hong, W., Zhang, X., & Feng, G., et al. (2024). Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:2408.06072"},{"key":"2295_CR82","unstructured":"Yu, S., Tack, J., Mo, S., Kim, H., Kim, J., Ha, J.-W., & Shin, J. (2022). Generating videos with dynamics-aware implicit generative adversarial networks. In ICLR"},{"key":"2295_CR83","doi-asserted-by":"crossref","unstructured":"Zhang, L., & Agrawala, M. (2023). Adding conditional control to text-to-image diffusion models. arXiv preprint arXiv:2302.05543","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"2295_CR84","doi-asserted-by":"crossref","unstructured":"Zhang, D.J., Wu, J.Z., Liu, J.-W., Zhao, R., Ran, L., Gu, Y., Gao, D., & Shou, M.Z. (2023). Show-1: Marrying pixel and latent diffusion models for text-to-video generation. arXiv preprint arXiv:2309.15818","DOI":"10.1007\/s11263-024-02271-9"},{"key":"2295_CR85","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Xing, Z., Zeng, Y., Fang, Y., & Chen, K. (2024). Pia: Your personalized image animator via plug-and-play modules in text-to-image models. In Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR52733.2024.00740"},{"key":"2295_CR86","unstructured":"Zhang, Q., Yang, C., Shen, Y., Xu, Y., & Zhou, B. (2023). Towards smooth video composition. In ICLR"},{"key":"2295_CR87","unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y., & Feng, J. (2022). Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018"},{"key":"2295_CR88","unstructured":"Zhou, S., Chan, K., Li, C., & Loy, C. C. (2022). Towards robust blind face restoration with codebook lookup transformer. Advances in Neural Information Processing Systems, 35(2022), 30599\u201330611."},{"key":"2295_CR89","first-page":"3499","volume":"33","author":"S Zhou","year":"2020","unstructured":"Zhou, S., Zhang, J., Zuo, W., & Loy, C. C. (2020). Cross-scale internal graph neural network for image super-resolution. 
Advances in Neural Information Processing Systems, 33, 3499\u20133509.","journal-title":"Advances in Neural Information Processing Systems"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02295-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-024-02295-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-024-02295-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,4,17]],"date-time":"2025-04-17T06:01:29Z","timestamp":1744869689000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-024-02295-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12,23]]},"references-count":89,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["2295"],"URL":"https:\/\/doi.org\/10.1007\/s11263-024-02295-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12,23]]},"assertion":[{"value":"29 March 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"28 October 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 December 2024","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"We acknowledge the ethical concerns that are shared with other T2I and T2V diffusion models. We aim to synthesize high-quality videos by giving text descriptions. Our approach can be used for movie production, making video games, artistic creation, generating synthetic data for other computer vision tasks, etc. We note that our framework has the potential to introduce unintended bias as a result of the training data.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}}]}}
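
The record above is the standard Crossref work-message envelope: a top-level object with "status", "message-type", and "message", where "message" holds the bibliographic fields (DOI, title, authors, reference list, license, funder, and "date-parts" dates). Below is a minimal sketch, using only the Python standard library, of how such a record can be fetched from the public Crossref REST API (api.crossref.org) and a few of the fields shown above read out. The endpoint and envelope shape are real Crossref conventions; the specific field accesses simply mirror this record and may need guarding for other works, where some fields (e.g., "page", month/day in "date-parts") can be absent.

    # Sketch: retrieve and inspect a Crossref work record like the one above.
    import json
    import urllib.request

    DOI = "10.1007/s11263-024-02295-1"
    url = f"https://api.crossref.org/works/{DOI}"

    with urllib.request.urlopen(url) as resp:
        # Unwrap the {"status", "message-type", "message": {...}} envelope.
        work = json.load(resp)["message"]

    print(work["title"][0])            # title is a list of strings
    print(work["container-title"][0])  # "International Journal of Computer Vision"
    print(work["DOI"], work["volume"], work["page"])

    # Authors are a list of {"given": ..., "family": ...} dicts in byline order.
    byline = ", ".join(
        f'{a.get("given", "")} {a.get("family", "")}'.strip()
        for a in work.get("author", [])
    )
    print(byline)

    # Dates use nested "date-parts" lists: [[year, month, day]]; month/day
    # may be omitted in other records, so real code should not hard-unpack.
    year, month, day = work["published"]["date-parts"][0]
    print(f"Published online: {year}-{month:02d}-{day:02d}")

    print(f'{work["references-count"]} references, '
          f'cited {work["is-referenced-by-count"]} times')

Note that counts such as "references-count" (89 here) describe the deposited "reference" array, while "is-referenced-by-count" is a live citation tally that changes between API snapshots, which is why it should be read from a fresh response rather than a cached dump like the one above.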