{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T16:56:19Z","timestamp":1777568179326,"version":"3.51.4"},"reference-count":93,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2025,1,24]],"date-time":"2025-01-24T00:00:00Z","timestamp":1737676800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,24]],"date-time":"2025-01-24T00:00:00Z","timestamp":1737676800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1007\/s11263-025-02346-1","type":"journal-article","created":{"date-parts":[[2025,1,24]],"date-time":"2025-01-24T14:25:49Z","timestamp":1737728749000},"page":"3629-3644","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":11,"title":["MoonShot: Towards Controllable Video Generation and Editing with Motion-Aware Multimodal Conditions"],"prefix":"10.1007","volume":"133","author":[{"given":"David Junhao","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dongxu","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hung","family":"Le","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mike Zheng","family":"Shou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Caiming","family":"Xiong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Doyen","family":"Sahoo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,1,24]]},"reference":[{"key":"2346_CR1","unstructured":"An, J., Zhang, S., Yang, H., Gupta, S., Huang, J.B., Luo, J., Yin, X. (2023). Latent-shift: Latent diffusion with temporal shift for efficient text-to-video generation. arXiv preprint arXiv:2304.08477"},{"key":"2346_CR2","unstructured":"Babaeizadeh, M., Finn, C., Erhan, D., Campbell, R.H., Levine, S. (2017). Stochastic variational video prediction. arXiv preprint arXiv:1710.11252"},{"key":"2346_CR3","doi-asserted-by":"crossref","unstructured":"Bain, M., Nagrani, A., Varol, G., Zisserman, A. (2021). Frozen in time: A joint video and image encoder for end-to-end retrieval. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 1728\u20131738","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"2346_CR4","doi-asserted-by":"crossref","unstructured":"Blattmann, A., Rombach, R., Ling, H., Dockhorn, T., Kim, S.W., Fidler, S., Kreis, K. (2023a). Align your latents: High-resolution video synthesis with latent diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 22563\u201322575","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"2346_CR5","doi-asserted-by":"crossref","unstructured":"Blattmann, A., Rombach, R., Ling, H., Dockhorn, T., Kim, S.W., Fidler, S., Kreis, K. (2023b). Align your latents: High-resolution video synthesis with latent diffusion models. In: CVPR","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"2346_CR6","doi-asserted-by":"crossref","unstructured":"Ceylan, D., Huang, C.H., Mitra, N.J. (2023). Pix2video: Video editing using image diffusion. arXiv:2303.12688","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"2346_CR7","doi-asserted-by":"crossref","unstructured":"Chai, W., Guo, X., Wang, G., Lu, Y. (2023). Stablevideo: Text-driven consistency-aware diffusion video editing. arXiv preprint arXiv:2308.09592","DOI":"10.1109\/ICCV51070.2023.02106"},{"key":"2346_CR8","doi-asserted-by":"crossref","unstructured":"Chen, H., Zhang, Y., Cun, X., Xia, M., Wang, X., Weng, C., Shan, Y. (2024). Videocrafter2: Overcoming data limitations for high-quality video diffusion models. arXiv:2401.09047","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"2346_CR9","unstructured":"Chen, L., Zhao, M., Liu, Y., Ding, M., Song, Y., Wang, S., Wang, X., Yang, H., Liu, J., Du, K., et\u00a0al. (2023a). Photoverse: Tuning-free image customization with text-to-image diffusion models. arXiv preprint arXiv:2309.05793"},{"key":"2346_CR10","unstructured":"Chen, W., Wu, J., Xie, P., Wu, H., Li, J., Xia, X., Xiao, X., Lin, L. (2023b). Control-a-video: Controllable text-to-video generation with diffusion models. arXiv:2305.13840"},{"key":"2346_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., Huang, L., Liu, Y., Shen, Y., Zhao, D., Zhao, H. (2023c). Anydoor: Zero-shot object-level image customization. arXiv preprint arXiv:2307.09481","DOI":"10.1109\/CVPR52733.2024.00630"},{"key":"2346_CR12","doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A. (2023). Structure and content-guided video synthesis with diffusion models. arXiv preprint arXiv:2302.03011","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"2346_CR13","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1007\/978-3-031-19784-0_6","volume-title":"Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings","author":"O Gafni","year":"2022","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., & Taigman, Y. (2022). Make-a-scene: Scene-based text-to-image generation with human priors. In X. V. Part (Ed.), Computer Vision-ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23\u201327, 2022, Proceedings (pp. 89\u2013106). Springer."},{"key":"2346_CR14","doi-asserted-by":"crossref","unstructured":"Ge, S., Hayes, T., Yang, H., Yin, X., Pang, G., Jacobs, D., Huang, J.B., Parikh, D. (2022). Long video generation with time-agnostic vqgan and time-sensitive transformer. arXiv preprint arXiv:2204.03638","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"2346_CR15","doi-asserted-by":"crossref","unstructured":"Ge, S., Nah, S., Liu, G., Poon, T., Tao, A., Catanzaro, B., Jacobs, D., Huang, J.B., Liu, M.Y., Balaji, Y. (2023). Preserve your own correlation: A noise prior for video diffusion models. arXiv preprint arXiv:2305.10474","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"2346_CR16","unstructured":"Geyer, M., Bar-Tal, O., Bagon, S., Dekel, T. (2023). Tokenflow: Consistent diffusion features for consistent video editing. arXiv preprint arxiv:2307.10373"},{"key":"2346_CR17","doi-asserted-by":"crossref","unstructured":"Girdhar, R., Singh, M., Brown, A., Duval, Q., Azadi, S., Rambhatla, S.S., Shah, A., Yin, X., Parikh D., Misra I. (2023). Emu video: Factorizing text-to-video generation by explicit image conditioning. arXiv preprint arXiv:2311.10709","DOI":"10.1007\/978-3-031-73033-7_12"},{"key":"2346_CR18","unstructured":"Gu, Y., Wang, X., Wu, J.Z., Shi, Y., Chen, Y., Fan, Z., Xiao, W., Zhao, R., Chang, S., Wu, W., et\u00a0al. (2023). Mix-of-show: Decentralized low-rank adaptation for multi-concept customization of diffusion models. arXiv preprint arXiv:2305.18292"},{"key":"2346_CR19","unstructured":"Guo, Y., Yang, C., Rao, A., Wang, Y., Qiao, Y., Lin, D., Dai B. (2023). Animatediff: Animate your personalized text-to-image diffusion models without specific tuning. arXiv preprint arXiv:2307.04725"},{"key":"2346_CR20","unstructured":"Harvey, W., Naderiparizi, S., Masrani, V., Weilbach, C., Wood, F. (2022). Flexible diffusion modeling of long videos. arXiv preprint arXiv:2205.11495"},{"key":"2346_CR21","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y., Chen, Q. (2022). Latent video diffusion models for high-fidelity video generation with arbitrary lengths. arXiv preprint arXiv:2211.13221"},{"key":"2346_CR22","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S. (2017). Gans trained by a two time-scale update rule converge to a local nash equilibrium. Advances in neural information processing systems 30"},{"key":"2346_CR23","first-page":"6840","volume":"33","author":"J Ho","year":"2020","unstructured":"Ho, J., Jain, A., & Abbeel, P. (2020). Denoising diffusion probabilistic models. Advances in neural information processing systems, 33, 6840\u20136851.","journal-title":"Advances in neural information processing systems"},{"key":"2346_CR24","unstructured":"Ho, J., Chan, W., Saharia, C., Whang, J., Gao, R., Gritsenko, A., Kingma, D.P., Poole, B., Norouzi, M., Fleet, D.J., et\u00a0al. (2022). Imagen video: High definition video generation with diffusion models. arXiv preprint arXiv:2210.02303"},{"key":"2346_CR25","unstructured":"Hong, W., Ding, M., Zheng, W., Liu, X., Tang, J. (2022). Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868"},{"key":"2346_CR26","unstructured":"H\u00f6ppe, T., Mehrjou, A., Bauer, S., Nielsen, D., Dittadi, A. (2022). Diffusion models for video prediction and infilling. arXiv preprint arXiv:2206.07696"},{"key":"2346_CR27","unstructured":"Hu, E.J., Shen, Y., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., Chen, W. (2021). Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685"},{"key":"2346_CR28","doi-asserted-by":"crossref","unstructured":"Hu, J., Shen, L., Sun, G. (2018). Squeeze-and-excitation networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7132\u20137141","DOI":"10.1109\/CVPR.2018.00745"},{"key":"2346_CR29","doi-asserted-by":"crossref","unstructured":"Huang, Z., He, Y., Yu, J., Zhang, F., Si, C., Jiang, Y., Zhang, Y., Wu, T., Jin, Q., Chanpaisit, N., Wang, Y., Chen, X., Wang, L., Lin, D., Qiao, Y., Liu, Z. (2024). VBench: Comprehensive benchmark suite for video generative models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"2346_CR30","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Wu, T., Yang, S., Si, C., Lin, D., Qiao, Y., Loy, C.C., Liu, Z. (2024). Videobooth: Diffusion-based video generation with image prompts. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6689\u20136700","DOI":"10.1109\/CVPR52733.2024.00639"},{"key":"2346_CR31","unstructured":"Karras, T., Aittala, M., Aila, T., Laine, S. (2022). Elucidating the design space of diffusion-based generative models. arXiv preprint arXiv:2206.00364"},{"key":"2346_CR32","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., Movsisyan, A., Tadevosyan, V., Henschel, R., Wang, Z., Navasardyan, S., Shi, H. (2023a). Text2video-zero: Text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"2346_CR33","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., Movsisyan, A., Tadevosyan, V., Henschel, R., Wang, Z., Navasardyan, S., Shi, H. (2023b). Text2video-zero: Text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"2346_CR34","doi-asserted-by":"crossref","unstructured":"Kim, Y., Nam, S., Cho, I., Kim, S.J. (2019). Unsupervised keypoint learning for guiding class-conditional video prediction. Advances in neural information processing systems 32","DOI":"10.1186\/s13640-019-0478-8"},{"key":"2346_CR35","doi-asserted-by":"crossref","unstructured":"Kumari, N., Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y. (2023a). Multi-concept customization of text-to-image diffusion. In: CVPR","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"2346_CR36","doi-asserted-by":"crossref","unstructured":"Kumari, N,, Zhang, B., Zhang, R., Shechtman, E., Zhu, J.Y. (2023b). Multi-concept customization of text-to-image diffusion. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 1931\u20131941","DOI":"10.1109\/CVPR52729.2023.00192"},{"key":"2346_CR37","unstructured":"Le\u00a0Moing, G., Ponce, J., Schmid, C. (2021). Ccvs: Context-aware controllable video synthesis. NeurIPS"},{"key":"2346_CR38","unstructured":"Li, D., Li, J., Hoi, S.C. (2023a). Blip-diffusion: Pre-trained subject representation for controllable text-to-image generation and editing. arXiv preprint arXiv:2305.14720"},{"key":"2346_CR39","unstructured":"Li, X., Chu, W., Wu, Y., Yuan, W., Liu, F., Zhang, Q., Li, F., Feng, H., Ding, E., Wang, J. (2023b). Videogen: A reference-guided latent diffusion approach for high definition text-to-video generation. arXiv preprint arXiv:2309.00398"},{"key":"2346_CR40","doi-asserted-by":"crossref","unstructured":"Li, Y., Fang, C., Yang, J., Wang, Z., Lu, X., Yang, M.H. (2018). Flow-grounded spatial-temporal video prediction from still images. In: Proceedings of the European Conference on Computer Vision (ECCV), pp 600\u2013615","DOI":"10.1007\/978-3-030-01240-3_37"},{"key":"2346_CR41","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhang, Y., Li, W., Lin, Z., Jia, J. (2023). Video-p2p: Video editing with cross-attention control. arXiv:2303.04761","DOI":"10.1109\/CVPR52733.2024.00821"},{"key":"2346_CR42","doi-asserted-by":"crossref","unstructured":"Luo, Z., Chen, D., Zhang, Y., Huang, Y., Wang, L., Shen, Y., Zhao, D., Zhou, J., Tan, T. (2023). Videofusion: Decomposed diffusion models for high-quality video generation. In: CVPR","DOI":"10.1109\/CVPR52729.2023.10308948"},{"key":"2346_CR43","unstructured":"Ma, Z., Zhou, D., Yeh, C.H., Wang, X.S., Li, X., Yang, H., Dong, Z., Keutzer, K., Feng, J. (2024). Magic-me: Identity-specific video customized diffusion. arXiv preprint arXiv:2402.09368"},{"key":"2346_CR44","unstructured":"Molad, E., Horwitz, E., Valevski, D., Acha, A.R., Matias, Y., Pritch, Y., Leviathan, Y., Hoshen, Y. (2023). Dreamix: Video diffusion models are general video editors. arXiv preprint arXiv:2302.01329"},{"key":"2346_CR45","doi-asserted-by":"crossref","unstructured":"Mou, C., Wang, X., Xie, L., Zhang, J., Qi, Z., Shan, Y., Qie, X. (2023). T2i-adapter: Learning adapters to dig out more controllable ability for text-to-image diffusion models. arXiv preprint arXiv:2302.08453","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"2346_CR46","doi-asserted-by":"crossref","unstructured":"Ni, H., Shi, C., Li, K., Huang, S.X., Min, M.R. (2023). Conditional image-to-video generation with latent flow diffusion models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 18444\u201318455","DOI":"10.1109\/CVPR52729.2023.01769"},{"key":"2346_CR47","unstructured":"Nikankin, Y., Haim, N., Irani, M. (2022). Sinfusion: Training diffusion models on a single image or video. arXiv preprint arXiv:2211.11743"},{"key":"2346_CR48","unstructured":"Oquab, M., Darcet, T., Moutakanni, T., Vo, H., Szafraniec, M., Khalidov, V., Fernandez, P., Haziza, D., Massa, F., El-Nouby, A., et\u00a0al. (2023). Dinov2: Learning robust visual features without supervision. arXiv preprint arXiv:2304.07193"},{"key":"2346_CR49","doi-asserted-by":"crossref","unstructured":"Pan, J., Wang, C., Jia, X., Shao, J., Sheng, L., Yan, J., Wang, X. (2019). Video generation from single semantic label map. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 3733\u20133742","DOI":"10.1109\/CVPR.2019.00385"},{"key":"2346_CR50","unstructured":"Podell, D., English, Z., Lacey, K., Blattmann, A., Dockhorn, T., M\u00fcller, J., Penna, J., Rombach, R. (2023). Sdxl: Improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952"},{"key":"2346_CR51","unstructured":"Pont-Tuset, J., Perazzi, F., Caelles, S., Arbel\u00e1ez, P., Sorkine-Hornung, A., Van\u00a0Gool, L. (2017). The 2017 davis challenge on video object segmentation. arXiv preprint arXiv:1704.00675"},{"key":"2346_CR52","doi-asserted-by":"crossref","unstructured":"Qi, C., Cun, X., Zhang, Y., Lei, C., Wang, X., Shan, Y., Chen, Q. (2023). Fatezero: Fusing attentions for zero-shot text-based video editing. arXiv preprint arXiv:2303.09535","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"2346_CR53","unstructured":"Ren, W., Yang, H., Zhang, G., Wei, C., Du, X., Huang, S., Chen, W. (2024). Consisti2v: Enhancing visual consistency for image-to-video generation. arXiv preprint arXiv:2402.04324"},{"key":"2346_CR54","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B. (2022). High-resolution image synthesis with latent diffusion models. In: CVPR, pp 10684\u201310695","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"2346_CR55","doi-asserted-by":"crossref","unstructured":"Ruiz, N., Li, Y., Jampani, V., Pritch, Y., Rubinstein, M., Aberman, K. (2022). Dreambooth: Fine tuning text-to-image diffusion models for subject-driven generation. arXiv preprint arXiv:2208.12242","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"2346_CR56","doi-asserted-by":"crossref","unstructured":"Saito, M., Matsumoto, E., Saito, S. (2017). Temporal generative adversarial nets with singular value clipping. In: ICCV","DOI":"10.1109\/ICCV.2017.308"},{"key":"2346_CR57","unstructured":"Schuhmann, C., Vencu, R., Beaumont, R., Coombes, T., Gordon, C., Katta, A., Kaczmarczyk, R., Jitsev, J. (2022). LAION-5B: laion-5b: A new era of open large-scale multi-modal datasets. https:\/\/laion.ai\/laion-5b-a-new-era-of-open-large-scale-multi-modal-datasets\/"},{"key":"2346_CR58","doi-asserted-by":"crossref","unstructured":"Shen, X., Li, X., Elhoseiny, M. (2023). Mostgan-v: Video generation with temporal motion styles. In: CVPR","DOI":"10.1109\/CVPR52729.2023.00547"},{"key":"2346_CR59","unstructured":"Singer, U., Polyak, A., Hayes, T,, Yin X,, An, J., Zhang, S., Hu, Q., Yang, H., Ashual, O., Gafni, O., et\u00a0al. (2022). Make-a-video: Text-to-video generation without text-video data. arXiv preprint arXiv:2209.14792"},{"key":"2346_CR60","doi-asserted-by":"crossref","unstructured":"Skorokhodov, I., Tulyakov, S., Elhoseiny, M. (2021). Stylegan-v: A continuous video generator with the price, image quality and perks of stylegan2. arXiv preprint arXiv:2112.14683","DOI":"10.1109\/CVPR52688.2022.00361"},{"key":"2346_CR61","unstructured":"Smith, J.S., Hsu, Y.C., Zhang, L., Hua, T., Kira, Z., Shen, Y., Jin, H. (2023). Continual diffusion: Continual customization of text-to-image diffusion with c-lora. arXiv preprint arXiv:2304.06027"},{"key":"2346_CR62","unstructured":"Srivastava, N., Mansimov, E., Salakhudinov, R. (2015). Unsupervised learning of video representations using lstms. In: ICML"},{"key":"2346_CR63","unstructured":"Tian, Y., Ren, J., Chai, M., Olszewski, K., Peng, X., Metaxas, D.N., Tulyakov, S. (2021). A good image generator is what you need for high-resolution video synthesis. In: ICLR"},{"key":"2346_CR64","doi-asserted-by":"crossref","unstructured":"Tulyakov, S., Liu, M.Y., Yang, X., Kautz, J. (2018). Mocogan: Decomposing motion and content for video generation. In: CVPR","DOI":"10.1109\/CVPR.2018.00165"},{"key":"2346_CR65","unstructured":"Unterthiner, T., Van\u00a0Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S. (2018). Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717"},{"key":"2346_CR66","unstructured":"Voleti, V., Jolicoeur-Martineau, A., Pal, C. (2022). Masked conditional video diffusion for prediction, generation, and interpolation. arXiv preprint arXiv:2205.09853"},{"key":"2346_CR67","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A. (2016). Generating videos with scene dynamics. NIPS"},{"key":"2346_CR68","unstructured":"Voynov, A., Chu, Q., Cohen-Or, D., Aberman, K. (2023). $$ p+ $$: Extended textual conditioning in text-to-image generation. arXiv preprint arXiv:2303.09522"},{"key":"2346_CR69","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X., Zhang, S. (2023a). Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571"},{"key":"2346_CR70","unstructured":"Wang, W., Xie, k., Liu, Z., Chen, H., Cao, Y., Wang, X., Shen, C. (2023b). Zero-shot video editing using off-the-shelf image diffusion models. arXiv preprint arXiv:2303.17599"},{"key":"2346_CR71","unstructured":"Wang, W., Yang, H., Tuo, Z., He, H., Zhu, J., Fu, J., Liu, J. (2023c). Videofactory: Swap attention in spatiotemporal diffusions for text-to-video generation. arXiv preprint arXiv:2305.10874"},{"key":"2346_CR72","unstructured":"Wang, X., Yuan, H., Zhang, S., Chen, D., Wang, J., Zhang, Y., Shen, Y., Zhao, D., Zhou, J. (2023d). Videocomposer: Compositional video synthesis with motion controllability. arXiv preprint arXiv:2306.02018"},{"key":"2346_CR73","doi-asserted-by":"crossref","unstructured":"Wang, X., Zhang, S., Yuan, H., Qing, Z., Gong, B., Zhang, Y., Shen, Y., Gao, C., Sang, N. (2024). A recipe for scaling up text-to-video generation with text-free videos. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6572\u20136582","DOI":"10.1109\/CVPR52733.2024.00628"},{"key":"2346_CR74","unstructured":"Wang, Y., Li, K., Li, Y., He, Y., Huang. B., Zhao. Z., Zhang, H., Xu, J., Liu, Y., Wang, Z., Xing, S., Chen, G., Pan, J., Yu, J., Wang, Y., Wang, L., Qiao, Y. (2022). Internvideo: General video foundation models via generative and discriminative learning. arXiv preprint arXiv:2212.03191"},{"key":"2346_CR75","unstructured":"Wang, Y., Chen, X., Ma, X., Zhou, S., Huang, Z., Wang, Y., Yang, C., He, Y., Yu, J., Yang, P., et\u00a0al. (2023e). Lavie: High-quality video generation with cascaded latent diffusion models. arXiv preprint arXiv:2309.15103"},{"key":"2346_CR76","doi-asserted-by":"crossref","unstructured":"Wei, Y., Zhang, Y., Ji, Z., Bai, J., Zhang, L., Zuo, W. (2023). Elite: Encoding visual concepts into textual embeddings for customized text-to-image generation. arXiv preprint arXiv:2302.13848","DOI":"10.1109\/ICCV51070.2023.01461"},{"key":"2346_CR77","unstructured":"Wu, C., Huang, L., Zhang, Q., Li, B., Ji, L., Yang, F., Sapiro, G., Duan, N. (2021). Godiva: Generating open-domain videos from natural descriptions. arXiv preprint arXiv:2104.14806"},{"key":"2346_CR78","doi-asserted-by":"crossref","unstructured":"Wu, C., Liang J., Ji, L., Yang, F., Fang, Y., Jiang, D., Duan, N. (2022a). N\u00fcwa: Visual synthesis pre-training for neural visual world creation. In: ECCV, Springer, pp 720\u2013736","DOI":"10.1007\/978-3-031-19787-1_41"},{"key":"2346_CR79","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., Ge, Y., Wang, X., Lei, W., Gu, Y., Hsu, W., Shan, Y., Qie, X., Shou, M.Z. (2022b). Tune-a-video: One-shot tuning of image diffusion models for text-to-video generation. arXiv preprint arXiv:2212.11565","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"2346_CR80","doi-asserted-by":"crossref","unstructured":"Xing, J., Xia, M., Zhang, Y., Chen, H., Wang, X., Wong, T.T., Shan, Y. (2023). Dynamicrafter: Animating open-domain images with video diffusion priors. arXiv preprint arXiv:2310.12190","DOI":"10.1007\/978-3-031-72952-2_23"},{"key":"2346_CR81","doi-asserted-by":"crossref","unstructured":"Xiong, W., Luo, W., Ma, L., Liu, W., & Luo, J. (2018). Learning to generate time-lapse videos using multi-stage dynamic generative adversarial networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 2364\u20132373","DOI":"10.1109\/CVPR.2018.00251"},{"key":"2346_CR82","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y. (2016). Msr-vtt: A large video description dataset for bridging video and language. In: CVPR","DOI":"10.1109\/CVPR.2016.571"},{"key":"2346_CR83","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A. (2021). Videogpt: Video generation using vq-vae and transformers. arXiv preprint arXiv:2104.10157"},{"key":"2346_CR84","doi-asserted-by":"crossref","unstructured":"Yang, R., Srivastava, P., Mandt, S. (2022). Diffusion probabilistic modeling for video generation. arXiv preprint arXiv:2203.09481","DOI":"10.3390\/e25101469"},{"key":"2346_CR85","doi-asserted-by":"crossref","unstructured":"Yang, S., Zhou, Y., Liu, Z., Loy, C.C. (2023). Rerender a video: Zero-shot text-guided video-to-video translation. In: ACM SIGGRAPH Asia Conference Proceedings","DOI":"10.1145\/3610548.3618160"},{"key":"2346_CR86","unstructured":"Ye, H., Zhang, J., Liu, S., Han, X., Yang, W. (2023). Ip-adapter: Text compatible image prompt adapter for text-to-image diffusion models. arXiv preprint arxiv:2308.06721"},{"key":"2346_CR87","unstructured":"Yu, S., Tack, J., Mo, S., Kim, H., Kim, J., Ha, J.W., & Shin, J. (2021). Generating videos with dynamics-aware implicit generative adversarial networks. In: ICLR"},{"key":"2346_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, D.J., Wu, J.Z., Liu, J.W., Zhao, R., Ran, L., Gu, Y., Gao, D., & Shou, M.Z. (2023a). Show-1: Marrying pixel and latent diffusion models for text-to-video generation. arXiv preprint arXiv:2309.15818","DOI":"10.1007\/s11263-024-02271-9"},{"key":"2346_CR89","doi-asserted-by":"crossref","unstructured":"Zhang, L., Rao, A., & Agrawala, M. (2023b). Adding conditional control to text-to-image diffusion models","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"2346_CR90","unstructured":"Zhang, S., Wang, J., Zhang, Y., Zhao, K., Yuan, H., Qing, Z., Wang, X., Zhao, D., & Zhou, J. (2023c). I2vgen-xl: High-quality image-to-video synthesis via cascaded diffusion models. arXiv preprint arXiv:2311.04145"},{"key":"2346_CR91","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., Tian, Q. (2023d). Controlvideo: Training-free controllable text-to-video generation. arXiv preprint arXiv:2305.13077"},{"key":"2346_CR92","unstructured":"Zheng, Z., Peng, X., Yang, T., Shen, C., Li, S., Liu, H., Zhou, Y., & Li, T., You Y. (2024). Open-sora: Democratizing efficient video production for all. https:\/\/github.com\/hpcaitech\/Open-Sora"},{"key":"2346_CR93","unstructured":"Zhou, D., Wang, W., Yan, H., Lv. W., Zhu, Y., & Feng, J. (2022). Magicvideo: Efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02346-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-025-02346-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-025-02346-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,10]],"date-time":"2025-05-10T06:56:46Z","timestamp":1746860206000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-025-02346-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,24]]},"references-count":93,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2025,6]]}},"alternative-id":["2346"],"URL":"https:\/\/doi.org\/10.1007\/s11263-025-02346-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,24]]},"assertion":[{"value":"1 April 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"6 January 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"24 January 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}