{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T03:30:17Z","timestamp":1777865417511,"version":"3.51.4"},"reference-count":78,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01669","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"17961-17971","source":"Crossref","is-referenced-by-count":0,"title":["DOLLAR: Few-Step Video Generation Via Distillation and Latent Reward Optimization"],"prefix":"10.1109","author":[{"given":"Zihan","family":"Ding","sequence":"first","affiliation":[{"name":"Princeton University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chi","family":"Jin","sequence":"additional","affiliation":[{"name":"Princeton University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Difan","family":"Liu","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Haitian","family":"Zheng","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Krishna Kumar","family":"Singh","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qiang","family":"Zhang","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yan","family":"Kang","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhe","family":"Lin","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yuchen","family":"Liu","sequence":"additional","affiliation":[{"name":"Adobe Research"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","author":"Achiam","year":"2023","journal-title":"Gpt-4 technical report"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"ref3","author":"Black","year":"2023","journal-title":"Training diffusion models with reinforcement learning"},{"key":"ref4","author":"Blattmann","year":"2023","journal-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref7","author":"Chung","year":"2022","journal-title":"Diffusion posterior sampling for general noisy inverse problems"},{"key":"ref8","author":"Clark","year":"2023","journal-title":"Directly fine-tuning diffusion models on differentiable rewards"},{"key":"ref9","author":"Domingo-Enrich","year":"2024","journal-title":"Adjoint matching: Fine-tuning flow and diffusion generative models with memoryless stochastic optimal control"},{"key":"ref10","author":"Esser","year":"2023","journal-title":"Structureaware video generation with latent diffusion models"},{"key":"ref11","article-title":"Scaling rectified flow transformers for high-resolution image synthesis","volume-title":"Forty-first international conference on machine learning","author":"Esser"},{"key":"ref12","author":"Friedman","year":"2022","journal-title":"The vendi score: A diversity evaluation metric for machine learning"},{"key":"ref13","article-title":"Generative adversarial nets","volume":"27","author":"Goodfellow","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2027"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.127"},{"key":"ref16","author":"Ho","year":"2022","journal-title":"Classifier-free diffusion guidance"},{"key":"ref17","first-page":"6840","article-title":"Denoising diffusion probabilistic models","author":"Ho","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref18","author":"Ho","year":"2022","journal-title":"Video diffusion models"},{"key":"ref19","author":"Hong","year":"2022","journal-title":"Cogvideo: Large-scale pretraining for text-to-video generation with transformers"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref21","author":"Jin","year":"2024","journal-title":"Pyramidal flow matching for efficient video generative modeling"},{"key":"ref22","author":"Khachatryan","year":"2023","journal-title":"Text2video-zero: Zero-shot text-to-video generation using pretrained text-to-image diffusion models"},{"key":"ref23","author":"Kim","year":"2023","journal-title":"Consistency trajectory models: Learning probability flow ode trajectory of diffusion"},{"key":"ref24","author":"Kingma","year":"2013","journal-title":"Auto-encoding variational bayes"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.52202\/075280-1594"},{"key":"ref26","author":"Kong","year":"2024","journal-title":"Hunyuanvideo: A systematic framework for large video generative models"},{"key":"ref28","author":"Li","year":"2024","journal-title":"Reward guided latent consistency distillation"},{"key":"ref29","author":"Li","year":"2024","journal-title":"T2v-turbo: Breaking the quality bottleneck of video consistency model with mixed reward feedback"},{"key":"ref30","author":"Li","year":"2024","journal-title":"T2v-turbo-v2: Enhancing video generation model post-training through data, reward, and conditional guidance design"},{"key":"ref31","author":"Lipman","year":"2022","journal-title":"Flow matching for generative modeling"},{"key":"ref32","author":"Liu","year":"2022","journal-title":"Flow straight and fast: Learning to generate and transfer data with rectified flow"},{"key":"ref33","article-title":"Instaflow: One step is enough for high-quality diffusionbased text-to-image generation","volume-title":"The Twelfth International Conference on Learning Representations","author":"Liu"},{"key":"ref34","author":"Lu","year":"2024","journal-title":"Simplifying, stabilizing and scaling continuous-time consistency models"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.14711\/thesis-hdl151306"},{"key":"ref36","author":"Luo","year":"2023","journal-title":"Latent consistency models: Synthesizing high-resolution images with few-step inference"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.52202\/075280-3344"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"ref39","volume-title":"Pika Labs. Pika Labs","year":"2023"},{"key":"ref40","author":"Polino","year":"2018","journal-title":"Model compression via distillation and quantization"},{"key":"ref41","author":"Polyak","year":"2025","journal-title":"Movie gen: A cast of media foundation models"},{"key":"ref42","author":"Polyak","year":"2024","journal-title":"Movie gen: A cast of media foundation models"},{"key":"ref43","author":"Poole","year":"2022","journal-title":"Dreamfusion: Text-to-3d using 2d diffusion"},{"key":"ref44","author":"Prabhudesai","year":"2024","journal-title":"Video diffusion alignment via reward gradients"},{"key":"ref45","author":"Ren","year":"2024","journal-title":"Diffusion policy policy optimization"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref47","author":"Salimans","year":"2022","journal-title":"Progressive distillation for fast sampling of diffusion models"},{"key":"ref48","author":"Salimans","year":"2024","journal-title":"Multistep distillation of diffusion models via moment matching"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687625"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73016-0_6"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1833"},{"key":"ref52","author":"Singer","year":"2022","journal-title":"Make-a-video: Text-to-video generation without text-video data"},{"key":"ref53","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","volume-title":"International Conference on Machine Learning","author":"Sohl-Dickstein"},{"key":"ref54","article-title":"Denoising diffusion implicit models","volume-title":"International Conference on Learning Representations","author":"Song"},{"key":"ref55","article-title":"Generative modeling by estimating gradients of the data distribution","volume":"32","author":"Song","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref56","article-title":"Score-based generative modeling through stochastic differential equations","volume-title":"International Conference on Learning Representations","author":"Song"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/960126.806879"},{"key":"ref58","author":"Villegas","year":"2022","journal-title":"Phenaki: Variable length video generation from open domain textual descriptions"},{"key":"ref59","author":"Wang","year":"2024","journal-title":"Animatelcm: Accelerating the animation of personalized diffusion models and adapters with decoupled consistency learning"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01214"},{"key":"ref61","author":"Wang","year":"2023","journal-title":"Modelscope text-to-video technical report"},{"key":"ref62","author":"Wang","year":"2023","journal-title":"Videolcm: Video latent consistency model"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02295-1"},{"key":"ref64","author":"Wang","year":"2023","journal-title":"Internvid: A large-scale video-text dataset for multimodal understanding and generation"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73013-9_23"},{"key":"ref66","article-title":"Prolificdreamer: High-fidelity and diverse text-to-3d generation with variational score distillation","volume":"36","author":"Wang","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref67","author":"Wu","year":"2023","journal-title":"Human preference score v2: A solid benchmark for evaluating human preferences of text-toimage synthesis"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00200"},{"key":"ref69","author":"Xiao","year":"2021","journal-title":"Tackling the generative learning trilemma with denoising diffusion gans"},{"key":"ref70","author":"Xiao","year":"2023","journal-title":"Dual diffusion models for high-fidelity video generation"},{"key":"ref71","author":"Xie","year":"2024","journal-title":"Em distillation for one-step diffusion models"},{"key":"ref72","article-title":"Imagereward: Learning and evaluating human preferences for text-to-image generation","volume":"36","author":"Xu","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref73","author":"Yang","year":"2024","journal-title":"Cogvideox: Text-to-video diffusion models with an expert transformer"},{"key":"ref74","author":"Yin","year":"2024","journal-title":"Improved distribution matching distillation for fast image synthesis"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00632"},{"key":"ref76","author":"Zhang","year":"2025","journal-title":"Fast video generation with sliding tile attention"},{"key":"ref77","author":"Zhang","year":"2024","journal-title":"Sf-v: Single forward video generation model"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"},{"key":"ref79","author":"Zheng","year":"2024","journal-title":"Open-sora: Democratizing efficient video production for all"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11444240.pdf?arnumber=11444240","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T06:29:56Z","timestamp":1777530596000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11444240\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":78,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01669","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}