{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,26]],"date-time":"2026-03-26T04:13:06Z","timestamp":1774498386690,"version":"3.50.1"},"publisher-location":"Cham","reference-count":91,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031730320","type":"print"},{"value":"9783031730337","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:00:00Z","timestamp":1730332800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-73033-7_12","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:03:55Z","timestamp":1730333035000},"page":"205-224","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":75,"title":["Factorizing Text-to-Video Generation by\u00a0Explicit Image Conditioning"],"prefix":"10.1007","author":[{"given":"Rohit","family":"Girdhar","sequence":"first","affiliation":[]},{"given":"Mannat","family":"Singh","sequence":"additional","affiliation":[]},{"given":"Andrew","family":"Brown","sequence":"additional","affiliation":[]},{"given":"Quentin","family":"Duval","sequence":"additional","affiliation":[]},{"given":"Samaneh","family":"Azadi","sequence":"additional","affiliation":[]},{"given":"Sai Saketh","family":"Rambhatla","sequence":"additional","affiliation":[]},{"given":"Akbar","family":"Shah","sequence":"additional","affiliation":[]},{"given":"Xi","family":"Yin","sequence":"additional","affiliation":[]},{"given":"Devi","family":"Parikh","sequence":"additional","affiliation":[]},{"given":"Ishan","family":"Misra","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,10,31]]},"reference":[{"key":"12_CR1","unstructured":"Aghajanyan, A., et al.: Cm3: a causal masked multimodal model of the internet. arXiv:abs\/2201.07520 (2022)"},{"key":"12_CR2","doi-asserted-by":"publisher","unstructured":"Aldausari, N., Sowmya, A., Marcus, N., Mohammadi, G.: Video generative adversarial networks: a review. ACM Comput. Surv. 55(2) (2022). https:\/\/doi.org\/10.1145\/3487891, https:\/\/doi.org\/10.1145\/3487891","DOI":"10.1145\/3487891"},{"key":"12_CR3","unstructured":"An, J., et al.: Latent-shift: Latent diffusion with temporal shift for efficient text-to-video generation (2023)"},{"key":"12_CR4","unstructured":"Babaeizadeh, M., Finn, C., Erhan, D., Campbell, R.H., Levine, S.: Stochastic variational video prediction. In: ICLR (2018). https:\/\/openreview.net\/forum?id=rk49Mg-CW"},{"key":"12_CR5","unstructured":"Babaeizadeh, M., Saffar, M.T., Nair, S., Levine, S., Finn, C., Erhan, D.: FITVID: overfitting in pixel-level video prediction. arXiv preprint arXiv:2106.13195 (2020)"},{"key":"12_CR6","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: 2023 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp. 22563\u201322575 (2023). https:\/\/api.semanticscholar.org\/CorpusID:258187553","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"12_CR7","unstructured":"Blattmann, A., et\u00a0al.: Stable video diffusion: scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)"},{"key":"12_CR8","unstructured":"Brock, A., Donahue, J., Simonyan, K.: Large scale GAN training for high fidelity natural image synthesis. In: International Conference on Learning Representations (2019). https:\/\/openreview.net\/forum?id=B1xsqj09Fm"},{"key":"12_CR9","unstructured":"Brooks, T., et al.: Generating long videos of dynamic scenes. In: NeurIPS (2022)"},{"key":"12_CR10","doi-asserted-by":"crossref","unstructured":"Brooks, T., Holynski, A., Efros, A.A.: Instructpix2pix: learning to follow image editing instructions. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01764"},{"key":"12_CR11","unstructured":"Brown, T.B., et\u00a0al.: Language models are few-shot learners. preprint arXiv:2005.14165 (2020)"},{"key":"12_CR12","unstructured":"Chen, H., et al.: Videocrafter1: Open diffusion models for high-quality video generation. arXiv:2310.19512 (2023)"},{"key":"12_CR13","unstructured":"Chen, T.: On the importance of noise scheduling for diffusion models. arXiv preprint arXiv:2301.10972 (2023)"},{"key":"12_CR14","unstructured":"Chen, W., et al.: Control-a-video: controllable text-to-video generation with diffusion models. arXiv preprint arXiv:2305.13840 (2023)"},{"key":"12_CR15","unstructured":"Chung, H.W., et\u00a0al.: Scaling instruction-finetuned language models. arXiv preprint arXiv:2210.11416 (2022)"},{"key":"12_CR16","unstructured":"Clark, A., Donahue, J., Simonyan, K.: Adversarial video generation on complex datasets (2019)"},{"key":"12_CR17","unstructured":"Dai, X., et\u00a0al.: Emu: Enhancing image generation models using photogenic needles in a haystack. arXiv preprint arXiv:2309.15807 (2023)"},{"key":"12_CR18","unstructured":"Denton, E., Fergus, R.: Stochastic video generation with a learned prior. In: Dy, J., Krause, A. (eds.) Proceedings of the 35th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a080, pp. 1174\u20131183. PMLR (2018). https:\/\/proceedings.mlr.press\/v80\/denton18a.html"},{"key":"12_CR19","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis (2021)"},{"key":"12_CR20","unstructured":"Ding, M., Zheng, W., Hong, W., Tang, J.: Cogview2: faster and better text-to-image generation via hierarchical transformers. In: NeurIPS (2022)"},{"key":"12_CR21","unstructured":"Donahue, J., Krahenb\u00fchl, P., Darrell, T.: Adversarial feature learning. In: ICLR (2016)"},{"key":"12_CR22","doi-asserted-by":"crossref","unstructured":"Esser, P., Chiu, J., Atighehchian, P., Granskog, J., Germanidis, A.: Structure and content-guided video synthesis with diffusion models (2023)","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"12_CR23","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"12_CR24","doi-asserted-by":"crossref","unstructured":"Fei, H., Wu, S., Ji, W., Zhang, H., Chua, T.S.: Empowering dynamics-aware text-to-video diffusion with large language models (2023)","DOI":"10.1109\/CVPR52733.2024.00730"},{"key":"12_CR25","unstructured":"Finn, C., Goodfellow, I., Levine, S.: Unsupervised learning for physical interaction through video prediction. In: Proceedings of the 30th International Conference on Neural Information Processing Systems. NIPS\u201916, Red Hook, NY, USA, pp. 64\u201372. Curran Associates Inc. (2016)"},{"issue":"3","key":"12_CR26","doi-asserted-by":"publisher","first-page":"613","DOI":"10.1177\/001316447303300309","volume":"33","author":"JL Fleiss","year":"1973","unstructured":"Fleiss, J.L., Cohen, J.: The equivalence of weighted kappa and the intraclass correlation coefficient as measures of reliability. Educ. Psychol. Measur. 33(3), 613\u2013619 (1973)","journal-title":"Educ. Psychol. Measur."},{"key":"12_CR27","doi-asserted-by":"crossref","unstructured":"Fu, T.J., et al.: Tell me what happened: Unifying text-guided video completion via multimodal masked video generation. In: CVPR, pp. 10681\u201310692 (2023)","DOI":"10.1109\/CVPR52729.2023.01029"},{"key":"12_CR28","doi-asserted-by":"crossref","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., Taigman, Y.: Make-a-scene: scene-based text-to-image generation with human priors. arXiv preprint arXiv:2203.13131 (2022)","DOI":"10.1007\/978-3-031-19784-0_6"},{"key":"12_CR29","series-title":"LNCS","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1007\/978-3-031-19784-0_6","volume-title":"ECCV 2022","author":"O Gafni","year":"2022","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., Taigman, Y.: Make-a-scene: scene-based text-to-image generation with human priors. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13675, pp. 89\u2013106. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19784-0_6"},{"key":"12_CR30","doi-asserted-by":"crossref","unstructured":"Ge, S., et al.: Preserve your own correlation: a noise prior for video diffusion models (2023)","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"12_CR31","unstructured":"Gu, J., et al.: Reuse and diffuse: iterative denoising for text-to-video generation (2023)"},{"key":"12_CR32","unstructured":"Gupta, A., Tian, S., Zhang, Y., Wu, J., Mart\u00edn-Mart\u00edn, R., Fei-Fei, L.: Maskvit: masked visual pre-training for video prediction. In: ICLR (2023). https:\/\/openreview.net\/forum?id=QAV2CcLEDh"},{"key":"12_CR33","unstructured":"Harvey, W., Naderiparizi, S., Masrani, V., Weilbach, C., Wood, F.: Flexible diffusion modeling of long videos. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) NeurIPS, vol.\u00a035, pp. 27953\u201327965. Curran Associates, Inc. (2022). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/b2fe1ee8d936ac08dd26f2ff58986c8f-Paper-Conference.pdf"},{"key":"12_CR34","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y., Chen, Q.: Latent video diffusion models for high-fidelity long video generation (2023)"},{"key":"12_CR35","unstructured":"Ho, J., et al.: Imagen video: High definition video generation with diffusion models (2022)"},{"key":"12_CR36","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. arXiv preprint arxiv:2006.11239 (2020)"},{"key":"12_CR37","unstructured":"Ho, J., Saharia, C., Chan, W., Fleet, D.J., Norouzi, M., Salimans, T.: Cascaded diffusion models for high fidelity image generation. arXiv preprint arXiv:2106.15282 (2021)"},{"key":"12_CR38","unstructured":"Ho, J., Salimans, T.: Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)"},{"key":"12_CR39","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. In: Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., Oh, A. (eds.) NeurIPS, vol.\u00a035, pp. 8633\u20138646. Curran Associates, Inc. (2022). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/39235c56aef13fb05a6adc95eb9d8d66-Paper-Conference.pdf"},{"key":"12_CR40","unstructured":"Hong, S., Seo, J., Hong, S., Shin, H., Kim, S.: Large language models are frame-level directors for zero-shot text-to-video generation (2023)"},{"key":"12_CR41","unstructured":"Hong, W., Ding, M., Zheng, W., Liu, X., Tang, J.: Cogvideo: Large-scale pretraining for text-to-video generation via transformers (2022)"},{"key":"12_CR42","unstructured":"Kalchbrenner, N., et al.: Video pixel networks. In: Precup, D., Teh, Y.W. (eds.) Proceedings of the 34th International Conference on Machine Learning. Proceedings of Machine Learning Research, vol.\u00a070, pp. 1771\u20131779. PMLR (2017). https:\/\/proceedings.mlr.press\/v70\/kalchbrenner17a.html"},{"key":"12_CR43","doi-asserted-by":"crossref","unstructured":"Kang, M., et al.: Scaling up GANs for text-to-image synthesis. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2023)","DOI":"10.1109\/CVPR52729.2023.00976"},{"key":"12_CR44","doi-asserted-by":"crossref","unstructured":"Khachatryan, L., et al.: Text2video-zero: text-to-image diffusion models are zero-shot video generators. arXiv preprint arXiv:2303.13439 (2023)","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"12_CR45","volume-title":"Variational Temporal Abstraction","author":"T Kim","year":"2019","unstructured":"Kim, T., Ahn, S., Bengio, Y.: Variational Temporal Abstraction. Curran Associates Inc., Red Hook (2019)"},{"key":"12_CR46","unstructured":"Kumar, M., et al.: Videoflow: a conditional flow-based model for stochastic video generation. In: ICLR (2020). https:\/\/openreview.net\/forum?id=rJgUfTEYvH"},{"key":"12_CR47","unstructured":"Labs, P.: Pika labs. https:\/\/www.pika.art\/"},{"key":"12_CR48","doi-asserted-by":"crossref","unstructured":"Laptev, I., Lindeberg, T.: Space-time interest points. In: ICCV (2003)","DOI":"10.1109\/ICCV.2003.1238378"},{"key":"12_CR49","unstructured":"Lee, S., Kong, C., Jeon, D., Kwak, N.: Aadiff: Audio-aligned video synthesis with text-to-image diffusion (2023)"},{"key":"12_CR50","unstructured":"Lian, L., Shi, B., Yala, A., Darrell, T., Li, B.: LLM-grounded video diffusion models. arXiv preprint arXiv:2309.17444 (2023)"},{"key":"12_CR51","doi-asserted-by":"crossref","unstructured":"Lin, S., Liu, B., Li, J., Yang, X.: Common diffusion noise schedules and sample steps are flawed. arXiv preprint arXiv:2305.08891 (2023)","DOI":"10.1109\/WACV57701.2024.00532"},{"key":"12_CR52","unstructured":"Loshchilov, I., Hutter, F.: Decoupled weight decay regularization. arXiv preprint arXiv:1711.05101 (2017)"},{"key":"12_CR53","unstructured":"Mathieu, M., Couprie, C., LeCun, Y.: Deep multi-scale video prediction beyond mean square error (2016)"},{"key":"12_CR54","unstructured":"ML, R.: Gen2. https:\/\/research.runwayml.com\/gen2"},{"key":"12_CR55","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models. arXiv preprint arXiv:2112.10741 (2021)"},{"key":"12_CR56","unstructured":"Nichol, A., et al.: Glide: towards photorealistic image generation and editing with text-guided diffusion models (2022)"},{"key":"12_CR57","unstructured":"Podell, D., et al.: SDXL: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952 (2023)"},{"key":"12_CR58","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision (2021)"},{"key":"12_CR59","unstructured":"Ramesh, A., Dhariwal, P., Nichol, A., Chu, C., Chen, M.: Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 (2022)"},{"key":"12_CR60","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation (2021)"},{"key":"12_CR61","unstructured":"Ranzato, M., Szlam, A., Bruna, J., Mathieu, M., Collobert, R., Chopra, S.: Video (language) modeling: a baseline for generative models of natural videos. arXiv:abs\/1412.6604 (2014). https:\/\/api.semanticscholar.org\/CorpusID:17572062"},{"key":"12_CR62","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models (2021)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"12_CR63","doi-asserted-by":"crossref","unstructured":"Saharia, C., et al.: Photorealistic text-to-image diffusion models with deep language understanding (2022)","DOI":"10.1145\/3528233.3530757"},{"key":"12_CR64","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. In: NeurIPS, vol. 29 (2016)"},{"key":"12_CR65","unstructured":"Salimans, T., Ho, J.: Progressive distillation for fast sampling of diffusion models (2022)"},{"key":"12_CR66","unstructured":"Sauer, A., Karras, T., Laine, S., Geiger, A., Aila, T.: StyleGAN-T: unlocking the power of GANs for fast large-scale text-to-image synthesis. vol. abs\/2301.09515 (2023)"},{"key":"12_CR67","unstructured":"Shi, X., Chen, Z., Wang, H., Yeung, D.Y., Wong, W.K., WOO, W.C.: Convolutional LSTM network: a machine learning approach for precipitation nowcasting. In: Cortes, C., Lawrence, N., Lee, D., Sugiyama, M., Garnett, R. (eds.) NeurIPS. vol.\u00a028. Curran Associates, Inc. (2015). https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2015\/file\/07563a3fe3bbe7e3ba84431ad9d055af-Paper.pdf"},{"key":"12_CR68","unstructured":"Singer, U., et al.: Make-a-video: text-to-video generation without text-video data. In: ICLR (2023). https:\/\/openreview.net\/forum?id=nJfylDvgzlq"},{"key":"12_CR69","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: Bach, F., Blei, D. (eds.) Proceedings of the 32nd International Conference on Machine Learning. Proceedings of Machine Learning Research, Lille, France, vol.\u00a037, pp. 2256\u20132265. PMLR (2015). https:\/\/proceedings.mlr.press\/v37\/sohl-dickstein15.html"},{"key":"12_CR70","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv:2010.02502 (2020). https:\/\/arxiv.org\/abs\/2010.02502"},{"key":"12_CR71","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human action classes from videos in the wild. CRCV-TR-12-01 (2012)"},{"key":"12_CR72","unstructured":"Tang, Z., Yang, Z., Zhu, C., Zeng, M., Bansal, M.: Any-to-any generation via composable diffusion (2023)"},{"key":"12_CR73","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: FVD: a new metric for video generation (2019)"},{"key":"12_CR74","unstructured":"Villegas, R., et al.: Phenaki: variable length video generation from open domain textual descriptions. In: International Conference on Learning Representations (2023). https:\/\/openreview.net\/forum?id=vOEXS39nOF"},{"key":"12_CR75","unstructured":"Voleti, V., Jolicoeur-Martineau, A., Pal, C.: MCVD - masked conditional video diffusion for prediction, generation, and interpolation. In: Oh, A.H., Agarwal, A., Belgrave, D., Cho, K. (eds.) NeurIPS (2022)"},{"key":"12_CR76","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Generating videos with scene dynamics. In: Lee, D.D., Sugiyama, M., von Luxburg, U., Guyon, I., Garnett, R. (eds.) Advances in Neural Information Processing Systems 29: Annual Conference on Neural Information Processing Systems 2016, December 5-10, 2016, Barcelona, Spain, pp. 613\u2013621 (2016). https:\/\/proceedings.neurips.cc\/paper\/2016\/hash\/04025959b191f8f9de3f924f0940515f-Abstract.html"},{"key":"12_CR77","unstructured":"Wang, J., Yuan, H., Chen, D., Zhang, Y., Wang, X., Zhang, S.: Modelscope text-to-video technical report. arXiv preprint arXiv:2308.06571 (2023)"},{"key":"12_CR78","unstructured":"Wang, X., et al.: Videocomposer: compositional video synthesis with motion controllability. arXiv preprint arXiv:2306.02018 (2023)"},{"key":"12_CR79","unstructured":"Wichers, N., Villegas, R., Erhan, D., Lee, H.: Hierarchical long-term video prediction without supervision. In: International Conference on Machine Learning (2018). https:\/\/api.semanticscholar.org\/CorpusID:49193136"},{"key":"12_CR80","unstructured":"Wu, C., et al.: Godiva: generating open-domain videos from natural descriptions. arXiv:abs\/2104.14806 (2021). https:\/\/api.semanticscholar.org\/CorpusID:233476314"},{"key":"12_CR81","doi-asserted-by":"crossref","unstructured":"Wu, J.Z., et al.: Tune-a-video: one-shot tuning of image diffusion models for text-to-video generation. In: ICCV (2023)","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"12_CR82","doi-asserted-by":"crossref","unstructured":"Xing, Z., Dai, Q., Hu, H., Wu, Z., Jiang, Y.G.: SIMDA: simple diffusion adapter for efficient video generation (2023)","DOI":"10.1109\/CVPR52733.2024.00748"},{"key":"12_CR83","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: video generation using VQ-VAE and transformers (2021)"},{"key":"12_CR84","doi-asserted-by":"crossref","unstructured":"Yang, R., Srivastava, P., Mandt, S.: Diffusion probabilistic modeling for video generation. arXiv preprint arXiv:2203.09481 (2022)","DOI":"10.3390\/e25101469"},{"key":"12_CR85","unstructured":"Yin, S., et al.: DragNUWA: fine-grained control in video generation by integrating text, image, and trajectory. arXiv preprint arXiv:2308.08089 (2023)"},{"key":"12_CR86","doi-asserted-by":"crossref","unstructured":"Yin, S., et al.: NUWA-XL: diffusion over diffusion for extremely long video generation (2023)","DOI":"10.18653\/v1\/2023.acl-long.73"},{"key":"12_CR87","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation. arXiv preprint arXiv:2206.10789 (2022)"},{"key":"12_CR88","doi-asserted-by":"crossref","unstructured":"Yu, L., et al.: MAGVIT: masked generative video transformer. In: CVPR (2023). https:\/\/arxiv.org\/abs\/2212.05199","DOI":"10.1109\/CVPR52729.2023.01008"},{"key":"12_CR89","unstructured":"Zhang, S., et al.: I2VGEN-XL: high-quality image-to-video synthesis via cascaded diffusion models. arXiv preprint arXiv:2311.04145 (2023)"},{"key":"12_CR90","unstructured":"Zhang, Y., Wei, Y., Jiang, D., Zhang, X., Zuo, W., Tian, Q.: ControlVideo: training-free controllable text-to-video generation (2023)"},{"key":"12_CR91","unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y., Feng, J.: MagicVideo: efficient video generation with latent diffusion models (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-73033-7_12","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T00:36:15Z","timestamp":1730334975000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-73033-7_12"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,31]]},"ISBN":["9783031730320","9783031730337"],"references-count":91,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-73033-7_12","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,10,31]]},"assertion":[{"value":"31 October 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}