{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,2]],"date-time":"2026-06-02T10:28:33Z","timestamp":1780396113684,"version":"3.54.1"},"publisher-location":"Cham","reference-count":91,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031729850","type":"print"},{"value":"9783031729867","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T00:00:00Z","timestamp":1730505600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,2]],"date-time":"2024-11-02T00:00:00Z","timestamp":1730505600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-3-031-72986-7_23","type":"book-chapter","created":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T05:08:46Z","timestamp":1730437726000},"page":"393-411","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":83,"title":["Photorealistic Video Generation with\u00a0Diffusion Models"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5213-0224","authenticated-orcid":false,"given":"Agrim","family":"Gupta","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0645-1657","authenticated-orcid":false,"given":"Lijun","family":"Yu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4303-8319","authenticated-orcid":false,"given":"Kihyuk","family":"Sohn","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5568-564X","authenticated-orcid":false,"given":"Xiuye","family":"Gu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-1549-088X","authenticated-orcid":false,"given":"Meera","family":"Hahn","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7481-0810","authenticated-orcid":false,"given":"Fei-Fei","family":"Li","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6236-2969","authenticated-orcid":false,"given":"Irfan","family":"Essa","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0286-8439","authenticated-orcid":false,"given":"Lu","family":"Jiang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-2936-875X","authenticated-orcid":false,"given":"Jos\u00e9","family":"Lezama","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2024,11,2]]},"reference":[{"key":"23_CR1","unstructured":"Agostinelli, A., et\u00a0al.: MusicLM: generating music from text (2023). arXiv:2301.11325"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Bao, F., Li, C., Cao, Y., Zhu, J.: All are worth words: a ViT backbone for score-based diffusion models. In: NeurIPS 2022 Workshop on Score-Based Methods (2022)","DOI":"10.1109\/CVPR52729.2023.02171"},{"key":"23_CR3","unstructured":"Bengio, S., Vinyals, O., Jaitly, N., Shazeer, N.: Scheduled sampling for sequence prediction with recurrent neural networks. In: Advances in Neural Information Processing Systems, vol. 28 (2015)"},{"key":"23_CR4","unstructured":"Blattmann, A., et\u00a0al.: Stable video diffusion: scaling latent video diffusion models to large datasets. arXiv preprint arXiv:2311.15127 (2023)"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Blattmann, A., et al.: Align your latents: high-resolution video synthesis with latent diffusion models. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"23_CR6","unstructured":"Bousmalis, K., et\u00a0al.: RoboCat: a self-improving foundation agent for robotic manipulation. arXiv preprint arXiv:2306.11706 (2023)"},{"key":"23_CR7","unstructured":"Brock, A., Donahue, J., Simonyan, K.: Large scale GAN training for high fidelity natural image synthesis. In: ICLR (2018)"},{"key":"23_CR8","unstructured":"Brohan, A., et\u00a0al.: RT-1: robotics transformer for real-world control at scale. arXiv preprint arXiv:2212.06817 (2022)"},{"key":"23_CR9","doi-asserted-by":"crossref","unstructured":"Caron, M., et al.: Emerging properties in self-supervised vision transformers. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 9650\u20139660 (2021)","DOI":"10.1109\/ICCV48922.2021.00951"},{"key":"23_CR10","unstructured":"Carreira, J., Noland, E., Banki-Horvath, A., Hillier, C., Zisserman, A.: A short note about Kinetics-600 (2018). arXiv:1808.01340"},{"key":"23_CR11","unstructured":"Chang, H., et\u00a0al.: Muse: text-to-image generation via masked generative transformers. In: ICML (2023)"},{"key":"23_CR12","doi-asserted-by":"crossref","unstructured":"Chang, H., Zhang, H., Jiang, L., Liu, C., Freeman, W.T.: MaskGIT: masked generative image transformer. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01103"},{"key":"23_CR13","unstructured":"Chen, J., et\u00a0al.: PixArt-$$\\alpha $$: fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426 (2023)"},{"key":"23_CR14","unstructured":"Chen, T., Zhang, R., Hinton, G.: Analog bits: generating discrete data using diffusion models with self-conditioning. arXiv preprint arXiv:2208.04202 (2022)"},{"key":"23_CR15","unstructured":"Dehghani, M., et\u00a0al.: Scaling vision transformers to 22 billion parameters. In: International Conference on Machine Learning, pp. 7480\u20137512. PMLR (2023)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"23_CR17","unstructured":"Dhariwal, P., Nichol, A.: Diffusion models beat GANs on image synthesis. In: NeurIPS (2021)"},{"key":"23_CR18","unstructured":"Ding, M., et\u00a0al.: CogView: mastering text-to-image generation via transformers. In: NeurIPS (2021)"},{"key":"23_CR19","unstructured":"Dosovitskiy, A., et\u00a0al.: An image is worth $$16\\times 16$$ words: transformers for image recognition at scale. In: ICLR (2020)"},{"key":"23_CR20","unstructured":"Dumoulin, V., Shlens, J., Kudlur, M.: A learned representation for artistic style. arXiv preprint arXiv:1610.07629 (2016)"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Esser, P., Rombach, R., Ommer, B.: Taming transformers for high-resolution image synthesis. In: CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"23_CR22","doi-asserted-by":"publisher","first-page":"89","DOI":"10.1007\/978-3-031-19784-0_6","volume-title":"European Conference on Computer Vision","author":"O Gafni","year":"2022","unstructured":"Gafni, O., Polyak, A., Ashual, O., Sheynin, S., Parikh, D., Taigman, Y.: Make-a-scene: scene-based text-to-image generation with human priors. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13675, pp. 89\u2013106. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19784-0_6"},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Gao, S., Zhou, P., Cheng, M.M., Yan, S.: Masked diffusion transformer is a strong image synthesizer (2023). arXiv:2303.14389","DOI":"10.1109\/ICCV51070.2023.02117"},{"key":"23_CR24","doi-asserted-by":"publisher","first-page":"102","DOI":"10.1007\/978-3-031-19790-1_7","volume-title":"ECCV 2022","author":"S Ge","year":"2022","unstructured":"Ge, S., et al.: Long video generation with time-agnostic VQGAN and time-sensitive transformer. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13677, pp. 102\u2013118. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19790-1_7"},{"key":"23_CR25","doi-asserted-by":"crossref","unstructured":"Ge, S., et al.: Preserve your own correlation: a noise prior for video diffusion models. arXiv preprint arXiv:2305.10474 (2023)","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Girdhar, R., et al.: Emu video: factorizing text-to-video generation by explicit image conditioning. arXiv preprint arXiv:2311.10709 (2023)","DOI":"10.1007\/978-3-031-73033-7_12"},{"key":"23_CR27","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: Advances in Neural Information Processing Systems, vol. 27 (2014)"},{"key":"23_CR28","unstructured":"Google: PaLM 2 technical report (2023). arXiv:2305.10403"},{"key":"23_CR29","unstructured":"Gupta, A., Tian, S., Zhang, Y., Wu, J., Mart\u00edn-Mart\u00edn, R., Fei-Fei, L.: MaskViT: masked visual pre-training for video prediction. In: ICLR (2022)"},{"key":"23_CR30","unstructured":"Gupta, A., Wu, J., Deng, J., Fei-Fei, L.: Siamese masked autoencoders. arXiv preprint arXiv:2305.14344 (2023)"},{"key":"23_CR31","unstructured":"Harvey, W., Naderiparizi, S., Masrani, V., Weilbach, C., Wood, F.: Flexible diffusion modeling of long videos. In: Advances in Neural Information Processing Systems, vol. 35, pp. 27953\u201327965 (2022)"},{"key":"23_CR32","doi-asserted-by":"crossref","unstructured":"He, K., Chen, X., Xie, S., Li, Y., Doll\u00e1r, P., Girshick, R.: Masked autoencoders are scalable vision learners. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01553"},{"key":"23_CR33","unstructured":"He, Y., Yang, T., Zhang, Y., Shan, Y., Chen, Q.: Latent video diffusion models for high-fidelity long video generation. arXiv preprint arXiv:2211.13221 (2023)"},{"key":"23_CR34","unstructured":"Heusel, M., Ramsauer, H., Unterthiner, T., Nessler, B., Hochreiter, S.: GANs trained by a two time-scale update rule converge to a local Nash equilibrium. In: NeurIPS (2017)"},{"key":"23_CR35","unstructured":"Ho, J., et\u00a0al.: Imagen video: high definition video generation with diffusion models (2022). arXiv:2210.02303"},{"key":"23_CR36","unstructured":"Ho, J., Jain, A., Abbeel, P.: Denoising diffusion probabilistic models. In: NeurIPS (2020)"},{"issue":"1","key":"23_CR37","first-page":"2249","volume":"23","author":"J Ho","year":"2022","unstructured":"Ho, J., Saharia, C., Chan, W., Fleet, D.J., Norouzi, M., Salimans, T.: Cascaded diffusion models for high fidelity image generation. JMLR 23(1), 2249\u20132281 (2022)","journal-title":"JMLR"},{"key":"23_CR38","unstructured":"Ho, J., Salimans, T., Gritsenko, A., Chan, W., Norouzi, M., Fleet, D.J.: Video diffusion models. In: ICLR Workshops (2022)"},{"key":"23_CR39","unstructured":"Hong, W., Ding, M., Zheng, W., Liu, X., Tang, J.: CogVideo: large-scale pretraining for text-to-video generation via transformers (2022). arXiv:2205.15868"},{"key":"23_CR40","unstructured":"Hoogeboom, E., Heek, J., Salimans, T.: Simple diffusion: end-to-end diffusion for high resolution images. In: ICML (2023)"},{"key":"23_CR41","unstructured":"Hu, E.J., et al.: LoRA: low-rank adaptation of large language models. In: ICLR (2021)"},{"key":"23_CR42","unstructured":"Jabri, A., Fleet, D.J., Chen, T.: Scalable adaptive computation for iterative generation. In: ICML (2023)"},{"key":"23_CR43","unstructured":"Jiang, Y., Chang, S., Wang, Z.: TransGAN: two pure transformers can make one strong GAN, and that can scale up. In: Advances in Neural Information Processing Systems, vol. 34, pp. 14745\u201314758 (2021)"},{"key":"23_CR44","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"694","DOI":"10.1007\/978-3-319-46475-6_43","volume-title":"Computer Vision \u2013 ECCV 2016","author":"J Johnson","year":"2016","unstructured":"Johnson, J., Alahi, A., Fei-Fei, L.: Perceptual losses for real-time style transfer and super-resolution. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9906, pp. 694\u2013711. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46475-6_43"},{"key":"23_CR45","doi-asserted-by":"crossref","unstructured":"Karras, T., Laine, S., Aila, T.: A style-based generator architecture for generative adversarial networks. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00453"},{"key":"23_CR46","unstructured":"Kingma, D.P., Gao, R.: Understanding the diffusion objective as a weighted integral of ELBOs (2023). arXiv:2303.00848"},{"issue":"11","key":"23_CR47","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y LeCun","year":"1998","unstructured":"LeCun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998)","journal-title":"Proc. IEEE"},{"key":"23_CR48","unstructured":"Lee, K., Chang, H., Jiang, L., Zhang, H., Tu, Z., Liu, C.: ViTGAN: training GANs with vision transformers. arXiv preprint arXiv:2107.04589 (2021)"},{"key":"23_CR49","doi-asserted-by":"crossref","unstructured":"Lin, S., Liu, B., Li, J., Yang, X.: Common diffusion noise schedules and sample steps are flawed. arXiv preprint arXiv:2305.08891 (2023)","DOI":"10.1109\/WACV57701.2024.00532"},{"key":"23_CR50","doi-asserted-by":"crossref","unstructured":"Liu, Z., et al.: Swin transformer: hierarchical vision transformer using shifted windows. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 10012\u201310022 (2021)","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"23_CR51","unstructured":"Lu, H., et al.: VDT: general-purpose video diffusion transformers via mask modeling. arXiv preprint arXiv:2305.13311 (2023)"},{"key":"23_CR52","unstructured":"Luc, P., et al.: Transformation-based adversarial video prediction on large-scale data (2020). arXiv:2003.04035"},{"key":"23_CR53","unstructured":"Nichol, A.Q., Dhariwal, P.: Improved denoising diffusion probabilistic models. In: International Conference on Machine Learning, pp. 8162\u20138171. PMLR (2021)"},{"key":"23_CR54","doi-asserted-by":"crossref","unstructured":"Peebles, W., Xie, S.: Scalable diffusion models with transformers (2022). arXiv:2212.09748","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"23_CR55","doi-asserted-by":"crossref","unstructured":"Perez, E., Strub, F., De\u00a0Vries, H., Dumoulin, V., Courville, A.: FiLM: visual reasoning with a general conditioning layer. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol.\u00a032 (2018)","DOI":"10.1609\/aaai.v32i1.11671"},{"key":"23_CR56","unstructured":"Radford, A., et\u00a0al.: Learning transferable visual models from natural language supervision. In: ICML (2021)"},{"key":"23_CR57","unstructured":"Radford, A., Kim, J.W., Xu, T., Brockman, G., McLeavey, C., Sutskever, I.: Robust speech recognition via large-scale weak supervision. In: International Conference on Machine Learning, pp. 28492\u201328518. PMLR (2023)"},{"key":"23_CR58","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"issue":"8","key":"23_CR59","first-page":"9","volume":"1","author":"A Radford","year":"2019","unstructured":"Radford, A., et al.: Language models are unsupervised multitask learners. OpenAI Blog 1(8), 9 (2019)","journal-title":"OpenAI Blog"},{"key":"23_CR60","unstructured":"Ramesh, A., et al.: Zero-shot text-to-image generation. In: ICML (2021)"},{"key":"23_CR61","unstructured":"Roberts, A., et al.: Exploring the limits of transfer learning with a unified text-to-text transformer (2019)"},{"key":"23_CR62","doi-asserted-by":"crossref","unstructured":"Rombach, R., Blattmann, A., Lorenz, D., Esser, P., Ommer, B.: High-resolution image synthesis with latent diffusion models. In: CVPR (2022)","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"23_CR63","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1007\/978-3-319-24574-4_28","volume-title":"Medical Image Computing and Computer-Assisted Intervention \u2013 MICCAI 2015","author":"O Ronneberger","year":"2015","unstructured":"Ronneberger, O., Fischer, P., Brox, T.: U-Net: convolutional networks for biomedical image segmentation. In: Navab, N., Hornegger, J., Wells, W.M., Frangi, A.F. (eds.) MICCAI 2015. LNCS, vol. 9351, pp. 234\u2013241. Springer, Cham (2015). https:\/\/doi.org\/10.1007\/978-3-319-24574-4_28"},{"key":"23_CR64","unstructured":"Salimans, T., Goodfellow, I., Zaremba, W., Cheung, V., Radford, A., Chen, X.: Improved techniques for training GANs. In: NeurIPS (2016)"},{"key":"23_CR65","unstructured":"Salimans, T., Ho, J.: Progressive distillation for fast sampling of diffusion models. arXiv preprint arXiv:2202.00512 (2022)"},{"key":"23_CR66","unstructured":"Savinov, N., Chung, J., Binkowski, M., Elsen, E., van den Oord, A.: Step-unrolled denoising autoencoders for text generation. arXiv preprint arXiv:2112.06749 (2021)"},{"key":"23_CR67","unstructured":"Singer, U., et\u00a0al.: Make-a-video: text-to-video generation without text-video data (2022). arXiv:2209.14792"},{"key":"23_CR68","unstructured":"Sohl-Dickstein, J., Weiss, E., Maheswaranathan, N., Ganguli, S.: Deep unsupervised learning using nonequilibrium thermodynamics. In: ICML (2015)"},{"key":"23_CR69","unstructured":"Song, J., Meng, C., Ermon, S.: Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)"},{"key":"23_CR70","unstructured":"Song, Y., Ermon, S.: Generative modeling by estimating gradients of the data distribution. In: NeurIPS (2019)"},{"key":"23_CR71","unstructured":"Soomro, K., Zamir, A.R., Shah, M.: UCF101: a dataset of 101 human actions classes from videos in the wild (2012). arXiv:1212.0402"},{"key":"23_CR72","unstructured":"Unterthiner, T., van Steenkiste, S., Kurach, K., Marinier, R., Michalski, M., Gelly, S.: Towards accurate generative models of video: a new metric & challenges (2018). arXiv:1812.01717"},{"key":"23_CR73","unstructured":"Van Den\u00a0Oord, A., Vinyals, O., et\u00a0al.: Neural discrete representation learning. In: NeurIPS (2017)"},{"key":"23_CR74","unstructured":"Vaswani, A., et al.: Attention is all you need. In: NeurIPS (2017)"},{"key":"23_CR75","unstructured":"Villegas, R., et al.: Phenaki: variable length video generation from open domain textual description (2022). arXiv:2210.02399"},{"key":"23_CR76","doi-asserted-by":"crossref","unstructured":"Vincent, P., Larochelle, H., Bengio, Y., Manzagol, P.A.: Extracting and composing robust features with denoising autoencoders. In: Proceedings of the 25th International Conference on Machine Learning, pp. 1096\u20131103 (2008)","DOI":"10.1145\/1390156.1390294"},{"key":"23_CR77","unstructured":"Wu, C., et al.: GODIVA: generating open-domain videos from natural descriptions. arXiv preprint arXiv:2104.14806 (2021)"},{"key":"23_CR78","doi-asserted-by":"publisher","first-page":"720","DOI":"10.1007\/978-3-031-19787-1_41","volume-title":"European Conference on Computer Vision 2022","author":"C Wu","year":"2022","unstructured":"Wu, C., et al.: N\u00dcWA: visual synthesis pre-training for neural visual world creation. In: Avidan, S., Brostow, G., Ciss\u00e9, M., Farinella, G.M., Hassner, T. (eds.) ECCV 2022. LNCS, vol. 13676, pp. 720\u2013736. Springer, Cham (2022). https:\/\/doi.org\/10.1007\/978-3-031-19787-1_41"},{"key":"23_CR79","unstructured":"Yan, W., Zhang, Y., Abbeel, P., Srinivas, A.: VideoGPT: video generation using VQ-VAE and transformers (2021). arXiv:2104.10157"},{"key":"23_CR80","unstructured":"Yu, J., et al.: Vector-quantized image modeling with improved VQGAN. In: ICLR (2022)"},{"key":"23_CR81","unstructured":"Yu, J., et\u00a0al.: Scaling autoregressive models for content-rich text-to-image generation (2022). arXiv:2206.10789"},{"key":"23_CR82","doi-asserted-by":"crossref","unstructured":"Yu, L., et\u00a0al.: MAGVIT: masked generative video transformer. In: CVPR (2023)","DOI":"10.1109\/CVPR52729.2023.01008"},{"key":"23_CR83","unstructured":"Yu, L., et\u00a0al.: Language model beats diffusion\u2013tokenizer is key to visual generation. arXiv preprint arXiv:2310.05737 (2023)"},{"key":"23_CR84","doi-asserted-by":"crossref","unstructured":"Yu, S., Sohn, K., Kim, S., Shin, J.: Video probabilistic diffusion models in projected latent space. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 18456\u201318466 (2023)","DOI":"10.1109\/CVPR52729.2023.01770"},{"key":"23_CR85","doi-asserted-by":"crossref","unstructured":"Zeng, Y., et al.: Make pixels dance: high-dynamic video generation. arXiv preprint arXiv:2311.10982 (2023)","DOI":"10.1109\/CVPR52733.2024.00845"},{"key":"23_CR86","doi-asserted-by":"crossref","unstructured":"Zhai, X., Kolesnikov, A., Houlsby, N., Beyer, L.: Scaling vision transformers. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 12104\u201312113 (2022)","DOI":"10.1109\/CVPR52688.2022.01179"},{"key":"23_CR87","doi-asserted-by":"crossref","unstructured":"Zhang, B., et al.: StyleSwin: transformer-based GAN for high-resolution image generation. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 11304\u201311314 (2022)","DOI":"10.1109\/CVPR52688.2022.01102"},{"key":"23_CR88","doi-asserted-by":"crossref","unstructured":"Zhang, R., Isola, P., Efros, A.A., Shechtman, E., Wang, O.: The unreasonable effectiveness of deep features as a perceptual metric. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00068"},{"key":"23_CR89","unstructured":"Zheng, H., Nie, W., Vahdat, A., Anandkumar, A.: Fast training of diffusion models with masked transformers (2023). arXiv:2306.09305"},{"key":"23_CR90","unstructured":"Zhou, D., Wang, W., Yan, H., Lv, W., Zhu, Y., Feng, J.: MagicVideo: efficient video generation with latent diffusion models. arXiv preprint arXiv:2211.11018 (2022)"},{"key":"23_CR91","unstructured":"Zitkovich, B., et\u00a0al.: RT-2: vision-language-action models transfer web knowledge to robotic control. In: CoRL (2023)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2024"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-72986-7_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T05:13:07Z","timestamp":1730437987000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-72986-7_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,2]]},"ISBN":["9783031729850","9783031729867"],"references-count":91,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-72986-7_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,11,2]]},"assertion":[{"value":"2 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Milan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Italy","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 September 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 October 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2024.ecva.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}