{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,2]],"date-time":"2026-05-02T14:59:17Z","timestamp":1777733957186,"version":"3.51.4"},"reference-count":58,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2025]]},"DOI":"10.1109\/access.2024.3522510","type":"journal-article","created":{"date-parts":[[2024,12,26]],"date-time":"2024-12-26T19:17:39Z","timestamp":1735240659000},"page":"1986-2003","source":"Crossref","is-referenced-by-count":3,"title":["ImproveYourVideos: Architectural Improvements for Text-to-Video Generation Pipeline"],"prefix":"10.1109","volume":"13","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-2850-6412","authenticated-orcid":false,"given":"Vladimir","family":"Arkhipkin","sequence":"first","affiliation":[{"name":"Sber AI Research, Moscow, Russia"}]},{"given":"Zein","family":"Shaheen","sequence":"additional","affiliation":[{"name":"Sber AI Research, Moscow, Russia"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9368-7524","authenticated-orcid":false,"given":"Viacheslav","family":"Vasilev","sequence":"additional","affiliation":[{"name":"Sber AI Research, Moscow, Russia"}]},{"given":"Elizaveta","family":"Dakhova","sequence":"additional","affiliation":[{"name":"Sber AI Research, Moscow, Russia"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2969-3501","authenticated-orcid":false,"given":"Konstantin","family":"Sobolev","sequence":"additional","affiliation":[{"name":"Artificial Intelligence Research Institute, Moscow, Russia"}]},{"given":"Andrey","family":"Kuznetsov","sequence":"additional","affiliation":[{"name":"Sber AI Research, Moscow, Russia"}]},{"given":"Denis","family":"Dimitrov","sequence":"additional","affiliation":[{"name":"Sber AI Research, Moscow, Russia"}]}],"member":"263","reference":[{"key":"ref1","first-page":"16784","article-title":"GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","volume":"162","author":"Nichol"},{"key":"ref2","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022","journal-title":"arXiv:2204.06125"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref4","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Saharia"},{"key":"ref5","article-title":"Make-A-Video: Text-to-video generation without text-video data","author":"Singer","year":"2022","journal-title":"arXiv:2209.14792"},{"key":"ref6","article-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022","journal-title":"arXiv:2210.02303"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref8","article-title":"AnimateDiff: Animate your personalized text-to-image diffusion models without specific tuning","author":"Guo","year":"2023","journal-title":"arXiv:2307.04725"},{"key":"ref9","article-title":"Emu video: Factorizing text-to-video generation by explicit image conditioning","author":"Girdhar","year":"2023","journal-title":"arXiv:2311.10709"},{"key":"ref10","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023","journal-title":"arXiv:2311.15127"},{"key":"ref11","article-title":"Make pixels dance: High-dynamic video generation","author":"Zeng","year":"2023","journal-title":"arXiv:2311.10982"},{"key":"ref12","article-title":"Hierarchical spatio-temporal decoupling for text-to-video generation","author":"Qing","year":"2023","journal-title":"arXiv:2312.04483"},{"key":"ref13","article-title":"Show-1: Marrying pixel and latent diffusion models for text-to-video generation","author":"Zhang","year":"2023","journal-title":"arXiv:2309.15818"},{"key":"ref14","first-page":"23412","article-title":"MoVQ: Modulating quantized vectors for high-fidelity image generation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Zheng"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123309"},{"key":"ref16","article-title":"Stochastic variational video prediction","author":"Babaeizadeh","year":"2017","journal-title":"arXiv:1710.11252"},{"key":"ref17","article-title":"FitVid: Overfitting in pixel-level video prediction","author":"Babaeizadeh","year":"2021","journal-title":"arXiv:2106.13195"},{"key":"ref18","article-title":"VideoGPT: Video generation using VQ-VAE and transformers","author":"Yan","year":"2021","journal-title":"arXiv:2104.10157"},{"key":"ref19","article-title":"Predicting video with VQVAE","author":"Walker","year":"2021","journal-title":"arXiv:2103.01950"},{"key":"ref20","first-page":"613","article-title":"Generating videos with scene dynamics","volume-title":"Proc. 30th Int. Conf. Neural Inf. Process. Syst.","author":"Vondrick"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3127905"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12233"},{"key":"ref23","article-title":"Stochastic adversarial video prediction","author":"Lee","year":"2018","journal-title":"arXiv:1804.01523"},{"key":"ref24","article-title":"Adversarial video generation on complex datasets","author":"Clark","year":"2019","journal-title":"arXiv:1907.06571"},{"key":"ref25","article-title":"VideoFlow: A conditional flow-based model for stochastic video generation","author":"Kumar","year":"2019","journal-title":"arXiv:1903.01434"},{"key":"ref26","article-title":"GODIVA: Generating open-domain videos from natural descriptions","author":"Wu","year":"2021","journal-title":"arXiv:2104.14806"},{"key":"ref27","first-page":"720","article-title":"N\u00fcwa: Visual synthesis pre-training for neural visual world creation","volume-title":"Proc. Eur. Conf. Comput. Vis.","author":"Wu"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19790-1_7"},{"key":"ref29","article-title":"CogVideo: Large-scale pretraining for text-to-video generation via transformers","author":"Hong","year":"2022","journal-title":"arXiv:2205.15868"},{"key":"ref30","article-title":"Phenaki: Variable length video generation from open domain textual description","author":"Villegas","year":"2022","journal-title":"arXiv:2210.02399"},{"key":"ref31","first-page":"16890","article-title":"CogView2: Faster and better text-to-image generation via hierarchical transformers","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ding"},{"key":"ref32","article-title":"Latent video diffusion models for high-fidelity long video generation","author":"He","year":"2022","journal-title":"arXiv:2211.13221"},{"key":"ref33","article-title":"MagicVideo: Efficient video generation with latent diffusion models","author":"Zhou","year":"2022","journal-title":"arXiv:2211.11018"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"ref35","article-title":"Video diffusion models","author":"Ho","year":"2022","journal-title":"arXiv:2204.03458"},{"key":"ref36","article-title":"Tune-A-video: One-shot tuning of image diffusion models for text-to-video generation","author":"Wu","year":"2022","journal-title":"arXiv:2212.11565"},{"key":"ref37","article-title":"VideoGen: A reference-guided latent diffusion approach for high definition text-to-video generation","author":"Li","year":"2023","journal-title":"arXiv:2309. 00398"},{"key":"ref38","article-title":"Kandinsky 3.0 technical report","volume-title":"arXiv:2312.03511","author":"Arkhipkin","year":"2023"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_27"},{"key":"ref40","article-title":"Progressive distillation for fast sampling of diffusion models","author":"Salimans","year":"2022","journal-title":"arXiv:2202.00512"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1212.0402"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.571"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-018-01144-2"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00984"},{"key":"ref45","article-title":"Towards accurate generative models of video: A new metric & challenges","author":"Unterthiner","year":"2018","journal-title":"arXiv:1812.01717"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-020-01333-y"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00361"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"ref49","first-page":"22139","article-title":"Evalcrafter: Benchmarking and evaluating large video generation models","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit. (CVPR)","author":"Liu"},{"key":"ref50","volume-title":"Gen2","year":"2023"},{"key":"ref51","volume-title":"PikaLab V1.0","year":"2023"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref53","volume-title":"MoonValley","year":"2023"},{"key":"ref54","article-title":"LAVIE: High-quality video generation with cascaded latent diffusion models","author":"Wang","year":"2023","journal-title":"arXiv:2309.15103"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1093\/oed\/1871017213"},{"key":"ref56","article-title":"VideoCrafter1: Open diffusion models for high-quality video generation","author":"Chen","year":"2023","journal-title":"arXiv:2310. 19512"},{"key":"ref57","article-title":"ModelScope text-to-video technical report","volume-title":"arXiv:2308.06571","author":"Wang","year":"2023"},{"key":"ref58","volume-title":"ZeroScope","year":"2023"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10820123\/10815947.pdf?arnumber=10815947","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,6]],"date-time":"2025-01-06T19:47:27Z","timestamp":1736192847000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10815947\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"references-count":58,"URL":"https:\/\/doi.org\/10.1109\/access.2024.3522510","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]}}}