{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T15:21:36Z","timestamp":1778080896873,"version":"3.51.4"},"reference-count":83,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,16]],"date-time":"2024-06-16T00:00:00Z","timestamp":1718496000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100000038","name":"Natural Sciences and Engineering Research Council of Canada (NSERC) Discovery","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000038","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100004326","name":"Simon Fraser University","doi-asserted-by":"publisher","award":["W911NF-21-2-0104"],"award-info":[{"award-number":["W911NF-21-2-0104"]}],"id":[{"id":"10.13039\/501100004326","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000183","name":"ARO","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000183","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["1839974"],"award-info":[{"award-number":["1839974"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,16]]},"DOI":"10.1109\/cvpr52733.2024.00764","type":"proceedings-article","created":{"date-parts":[[2024,9,16]],"date-time":"2024-09-16T17:34:53Z","timestamp":1726508093000},"page":"7996-8006","source":"Crossref","is-referenced-by-count":68,"title":["4D-fy: Text-to-4D Generation Using Hybrid Score Distillation Sampling"],"prefix":"10.1109","author":[{"given":"Sherwin","family":"Bahmani","sequence":"first","affiliation":[{"name":"University of Toronto"}]},{"given":"Ivan","family":"Skorokhodov","sequence":"additional","affiliation":[{"name":"KAUST"}]},{"given":"Victor","family":"Rong","sequence":"additional","affiliation":[{"name":"University of Toronto"}]},{"given":"Gordon","family":"Wetzstein","sequence":"additional","affiliation":[{"name":"Stanford University"}]},{"given":"Leonidas","family":"Guibas","sequence":"additional","affiliation":[{"name":"Stanford University"}]},{"given":"Peter","family":"Wonka","sequence":"additional","affiliation":[{"name":"KAUST"}]},{"given":"Sergey","family":"Tulyakov","sequence":"additional","affiliation":[{"name":"Snap Inc."}]},{"given":"Jeong Joon","family":"Park","sequence":"additional","affiliation":[{"name":"University of Michigan"}]},{"given":"Andrea","family":"Tagliasacchi","sequence":"additional","affiliation":[{"name":"University of Toronto"}]},{"given":"David B.","family":"Lindell","sequence":"additional","affiliation":[{"name":"University of Toronto"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Stable Diffusion version 2","year":"2023"},{"key":"ref2","volume-title":"Threestudio Github page","year":"2023"},{"key":"ref3","volume-title":"Zeroscope text-to-video model","year":"2023"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.3115\/980092.980121"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref6","article-title":"eDiff-I: Text-to-image diffusion models with an ensemble of expert denoisers","author":"Balaji","year":"2022","journal-title":"arXiv preprint"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00389"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1217"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-20893-6_7"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/383259.383316"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01201"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1145\/3588432.3591552"},{"key":"ref15","article-title":"Nerfdiff: Single-image view synthesis with Nerf-guided distillation from 3d-aware diffusion","volume-title":"Proc. ICML","author":"Gu","year":"2023"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01043"},{"key":"ref17","article-title":"Animatediff: Animate your personalized text-to-image diffusion models without specific tuning","author":"Guo","year":"2023","journal-title":"arXiv preprint"},{"key":"ref18","article-title":"Latent video diffusion models for high-fidelity video generation with arbitrary lengths","author":"He","year":"2022","journal-title":"arXiv preprint"},{"key":"ref19","article-title":"Classifier-free diffusion guidance","volume-title":"Proc. NeurIPS Workshop on Deep Generative Models","author":"Ho","year":"2021"},{"key":"ref20","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. NeurIPS","author":"Ho","year":"2020"},{"key":"ref21","article-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022","journal-title":"arXiv preprint"},{"issue":"1","key":"ref22","first-page":"2249","article-title":"Cascaded diffusion models for high fidelity image generation","volume":"23","author":"Ho","year":"2022","journal-title":"The Journal of Machine Learning Research"},{"key":"ref23","article-title":"Lrm: Large reconstruction model for single image to 3D","author":"Hong","year":"2023","journal-title":"arXiv preprint"},{"key":"ref24","article-title":"Lora: Low-rank adaptation of large language models","volume-title":"Proc. ICLR","author":"Hu","year":"2021"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00094"},{"key":"ref26","article-title":"Clipmatrix: Text-controlled creation of 3D textured meshes","author":"Jetchev","year":"2021","journal-title":"arXiv preprint"},{"key":"ref27","article-title":"Consistent4D: Consistent 360\u00b0 dynamic object generation from monocular video","author":"Jiang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref28","article-title":"Instant3D: Fast text-to-3D with sparse-view generation and large reconstruction model","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref29","article-title":"Sweet-dreamer: Aligning geometric priors in 2D diffusion for con-sistent text-to-3D","author":"Li","year":"2023","journal-title":"arXiv preprint"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00037"},{"key":"ref31","article-title":"Consistent123: One image to highly consistent 3D asset using case-aware diffusion priors","author":"Lin","year":"2023","journal-title":"arXiv preprint"},{"key":"ref32","article-title":"Align your gaussians: Text-to-4D with dynamic 4D gaussians and composed diffusion models","author":"Ling","year":"2023","journal-title":"arXiv preprint"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00853"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2006.04.045"},{"key":"ref35","article-title":"Syncdreamer: Generating multiview-consistent images from a single-view image","author":"Liu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3306346.3323020"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/3dv62453.2024.00044"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/3503250"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3528223.3530127"},{"key":"ref40","article-title":"GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models","volume-title":"Proc. ICML","author":"Nichol","year":"2022"},{"key":"ref41","article-title":"Fast dynamic 3D object generation from a single-view video","author":"Pan","year":"2024","journal-title":"arXiv preprint"},{"key":"ref42","article-title":"Benchmark for compositional text-to-image synthesis","volume-title":"Proc. NeurIPS","author":"Park","year":"2021"},{"key":"ref43","article-title":"State of the art on diffusion models for visual computing","author":"Po","year":"2023","journal-title":"arXiv preprint"},{"key":"ref44","article-title":"DreamFusion: Text-to-3D using 2D diffusion","volume-title":"Proc. ICLR","author":"Poole","year":"2023"},{"key":"ref45","article-title":"Magic123: One image to high-quality 3D object generation using both 2D and 3D diffusion priors","author":"Qian","year":"2023","journal-title":"arXiv preprint"},{"key":"ref46","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. ICML","author":"Radford","year":"2021"},{"key":"ref47","article-title":"Zero-shot text-to-image generation","volume-title":"Proc. ICML","author":"Ramesh","year":"2021"},{"key":"ref48","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022","journal-title":"arXiv preprint"},{"key":"ref49","article-title":"Generative adversarial text to image synthesis","volume-title":"Proc. ICML","author":"Reed","year":"2016"},{"key":"ref50","article-title":"DreamGaussian4D: Generative 4D Gaussian splatting","author":"Ren","year":"2023","journal-title":"arXiv preprint"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref52","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume-title":"Proc. NeurIPS","author":"Saharia","year":"2022"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01805"},{"key":"ref54","article-title":"Laion-5b: An open large-scale dataset for training next generation image-text models","volume-title":"Proc. NeurIPS","author":"Schuhmann","year":"2022"},{"key":"ref55","article-title":"MVDream: Multi-view diffusion for 3d generation","author":"Shi","year":"2023","journal-title":"arXiv preprint"},{"key":"ref56","article-title":"Make-a-video: Text-to-video generation without text-video data","author":"Singer","year":"2022","journal-title":"arXiv preprint"},{"key":"ref57","article-title":"Text-to-4d dynamic scene generation","volume-title":"Proc. ICML","author":"Singer","year":"2023"},{"key":"ref58","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","volume-title":"Proc. ICML","author":"Sohl-Dickstein","year":"2015"},{"key":"ref59","article-title":"Denoising diffusion implicit models","volume-title":"Proc. ICLR","author":"Song","year":"2021"},{"key":"ref60","article-title":"Score-based generative modeling through stochastic differential equations","volume-title":"Proc. ICLR","author":"Song","year":"2021"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2022.3148210"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02086"},{"key":"ref63","article-title":"Diffusion with forward models: Solving stochastic inverse problems without direct supervision","author":"Tewari","year":"2023","journal-title":"arXiv preprint"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01191"},{"key":"ref65","article-title":"Phenaki: Variable length video generation from open domain textual description","author":"Villegas","year":"2022","journal-title":"arXiv preprint"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00381"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01214"},{"key":"ref68","article-title":"Modelscope text-to-video technical report","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref69","article-title":"Videofactory: Swap attention in spatiotemporal diffusions for text-to-video generation","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref70","article-title":"Videocomposer: Compositional video synthesis with motion controllability","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref71","article-title":"Prolificdreamer: High-fidelity and diverse text-to-3D generation with variational score distillation","volume-title":"Proc. NeurIPS","author":"Wang","year":"2023"},{"key":"ref72","article-title":"Lamp: Learn a motion pattern for few-shot-based video generation","author":"Wu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00143"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00498"},{"key":"ref75","article-title":"4DGen: Grounded 4D content generation with spatial-temporal consistency","author":"Yin","year":"2023","journal-title":"arXiv preprint"},{"key":"ref76","article-title":"Dreamsparse: Escaping from platos cave with 2d diffusion model given sparse views","author":"Yoo","year":"2023","journal-title":"arXiv preprint"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref78","article-title":"Scaling autoregressive models for content-rich text-to-image generation","author":"Yu","year":"2022","journal-title":"arXiv preprint"},{"key":"ref79","article-title":"Scaling autoregressive multi-modal models: Pretraining and instruction tuning","author":"Yu","year":"2023","journal-title":"arXiv preprint"},{"key":"ref80","article-title":"Stack-GAN: Text to photo-realistic image synthesis with stacked generative adversarial networks","volume-title":"Proc. ICCV","author":"Zhang","year":"2017"},{"key":"ref81","article-title":"Animate124: Animating one image to 4D dynamic scene","author":"Zhao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref82","article-title":"A unified approach for text-and image-guided 4D scene generation","author":"Zheng","year":"2023","journal-title":"arXiv preprint"},{"key":"ref83","article-title":"Magicvideo: Efficient video generation with latent diffusion models","author":"Zhou","year":"2022","journal-title":"arXiv preprint"}],"event":{"name":"2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Seattle, WA, USA","start":{"date-parts":[[2024,6,16]]},"end":{"date-parts":[[2024,6,22]]}},"container-title":["2024 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10654794\/10654797\/10656060.pdf?arnumber=10656060","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,19]],"date-time":"2024-09-19T06:42:33Z","timestamp":1726728153000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10656060\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,16]]},"references-count":83,"URL":"https:\/\/doi.org\/10.1109\/cvpr52733.2024.00764","relation":{},"subject":[],"published":{"date-parts":[[2024,6,16]]}}}