{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,5]],"date-time":"2026-01-05T20:59:16Z","timestamp":1767646756520,"version":"3.48.0"},"reference-count":71,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,1]],"date-time":"2025-06-01T00:00:00Z","timestamp":1748736000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["72125009"],"award-info":[{"award-number":["72125009"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2022YFF1203001"],"award-info":[{"award-number":["2022YFF1203001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Circuits Syst. Video Technol."],"published-print":{"date-parts":[[2025,6]]},"DOI":"10.1109\/tcsvt.2025.3531390","type":"journal-article","created":{"date-parts":[[2025,1,20]],"date-time":"2025-01-20T14:07:34Z","timestamp":1737382054000},"page":"6034-6046","source":"Crossref","is-referenced-by-count":4,"title":["Spatio-Temporal Energy-Guided Diffusion Model for Zero-Shot Video Synthesis and Editing"],"prefix":"10.1109","volume":"35","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1905-8053","authenticated-orcid":false,"given":"Ling","family":"Yang","sequence":"first","affiliation":[{"name":"Institute of Medical Technology, Peking University Health Science Center, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2166-4607","authenticated-orcid":false,"given":"Yikai","family":"Zhao","sequence":"additional","affiliation":[{"name":"Department of Computer Science, National University of Singapore, Cluny Road, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6720-3666","authenticated-orcid":false,"given":"Zhaochen","family":"Yu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0999-6231","authenticated-orcid":false,"given":"Bohan","family":"Zeng","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9735-3767","authenticated-orcid":false,"given":"Minkai","family":"Xu","sequence":"additional","affiliation":[{"name":"Computer Science Department, Stanford University, Stanford, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7521-5127","authenticated-orcid":false,"given":"Shenda","family":"Hong","sequence":"additional","affiliation":[{"name":"Institute of Medical Technology, Peking University Health Science Center, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1681-4677","authenticated-orcid":false,"given":"Bin","family":"Cui","sequence":"additional","affiliation":[{"name":"Institute of Medical Technology, Peking University Health Science Center, Beijing, China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW59228.2023.00091"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19784-0_41"},{"key":"ref3","article-title":"ZoeDepth: Zero-shot transfer by combining relative and metric depth","author":"Bhat","year":"2023","journal-title":"arXiv:2302.12288"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.143"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02106"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3369757"},{"key":"ref10","article-title":"Control-A-video: Controllable text-to-video diffusion models with motion prior and reward feedback learning","author":"Chen","year":"2023","journal-title":"arXiv:2305.13840"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"ref12","first-page":"8780","article-title":"Diffusion models beat GANs on image synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dhariwal"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i1.25133"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"ref16","article-title":"TokenFlow: Consistent diffusion features for consistent video editing","author":"Geyer","year":"2023","journal-title":"arXiv:2307.10373"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3330920"},{"key":"ref18","article-title":"AnimateDiff: Animate your personalized text-to-image diffusion models without specific tuning","author":"Guo","year":"2023","journal-title":"arXiv:2307.04725"},{"key":"ref19","first-page":"1","article-title":"Prompt-to-prompt image editing with cross attention control","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Hertz"},{"key":"ref20","article-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022","journal-title":"arXiv:2210.02303"},{"key":"ref21","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ho"},{"key":"ref22","first-page":"1","article-title":"Classifier-free diffusion guidance","volume-title":"Proc. NeurIPS","author":"Ho"},{"key":"ref23","article-title":"Video diffusion models","author":"Ho","year":"2022","journal-title":"arXiv:2204.03458"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1603.08155"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"issue":"3","key":"ref26","first-page":"1731","article-title":"A tutorial on energy-based learning","volume":"1","author":"LeCun","year":"2006","journal-title":"Predicting Struct. Data"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3414412"},{"key":"ref28","article-title":"Video-P2P: Video editing with cross-attention control","author":"Liu","year":"2023","journal-title":"arXiv:2303.04761"},{"key":"ref29","article-title":"DPM-solver: A fast ODE solver for diffusion probabilistic model sampling in around 10 steps","author":"Lu","year":"2022","journal-title":"arXiv:2206.00927"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28206"},{"key":"ref31","first-page":"1058","article-title":"SDEdit: Guided image synthesis and editing with stochastic differential equations","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Meng"},{"key":"ref32","article-title":"Null-text inversion for editing real images using guided diffusion models","author":"Mokady","year":"2022","journal-title":"arXiv:2211.09794"},{"key":"ref33","article-title":"Dreamix: Video diffusion models are general video editors","author":"Molad","year":"2023","journal-title":"arXiv:2302.01329"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"ref35","article-title":"GLIDE: Towards photorealistic image generation and editing with text-guided diffusion models","author":"Nichol","year":"2021","journal-title":"arXiv:2112.10741"},{"key":"ref36","article-title":"CoDeF: Content deformation fields for temporally consistent video processing","author":"Ouyang","year":"2023","journal-title":"arXiv:2308.07926"},{"key":"ref37","article-title":"ControlNeXt: Powerful and efficient control for image and video generation","author":"Peng","year":"2024","journal-title":"arXiv:2408.06070"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01460"},{"key":"ref39","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref40","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022","journal-title":"arXiv:2204.06125"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02155"},{"key":"ref43","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Saharia"},{"key":"ref44","first-page":"1","article-title":"Make-a-video: Text-to-video generation without text-video data","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Singer"},{"key":"ref45","first-page":"2256","article-title":"Deep unsupervised learning using nonequilibrium thermodynamics","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sohl-Dickstein"},{"key":"ref46","first-page":"1","article-title":"Denoising diffusion implicit models","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Song"},{"key":"ref47","first-page":"1","article-title":"Generative modeling by estimating gradients of the data distribution","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Song"},{"key":"ref48","first-page":"1","article-title":"Score-based generative modeling through stochastic differential equations","volume-title":"Proc. Int. Conf. Learn. Represent.","author":"Song"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00507"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3083257"},{"key":"ref51","article-title":"Plug-and-play diffusion features for text-driven image-to-image translation","author":"Tumanyan","year":"2022","journal-title":"arXiv:2211.12572"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref53","article-title":"EasyControl: Transfer ControlNet to video diffusion for controllable generation and interpolation","author":"Wang","year":"2024","journal-title":"arXiv:2408.13005"},{"key":"ref54","article-title":"Tune-A-video: One-shot tuning of image diffusion models for text-to-video generation","author":"Zhangjie Wu","year":"2022","journal-title":"arXiv:2212.11565"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00102"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3326293"},{"key":"ref57","article-title":"Diffusion-based scene graph to image generation with masked contrastive pre-training","author":"Yang","year":"2022","journal-title":"arXiv:2211.11138"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1145\/3626235"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3610548.3618160"},{"key":"ref60","article-title":"CogVideoX: Text-to-video diffusion models with an expert transformer","author":"Yang","year":"2024","journal-title":"arXiv:2408.06072"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2023.3286841"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_20"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02118"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02271-9"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2024.3375330"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2021.3131738"},{"key":"ref68","article-title":"ControlVideo: Training-free controllable text-to-video generation","author":"Zhang","year":"2023","journal-title":"arXiv:2305.13077"},{"key":"ref69","first-page":"3609","article-title":"EGSDE: Unpaired image-to-image translation via energy-guided stochastic differential equations","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"35","author":"Zhao"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/s11432-023-4184-4"},{"key":"ref71","article-title":"MagicVideo: Efficient video generation with latent diffusion models","author":"Zhou","year":"2022","journal-title":"arXiv:2211.11018"}],"container-title":["IEEE Transactions on Circuits and Systems for Video Technology"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/76\/11027896\/10845865.pdf?arnumber=10845865","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,5]],"date-time":"2026-01-05T18:42:06Z","timestamp":1767638526000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10845865\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6]]},"references-count":71,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tcsvt.2025.3531390","relation":{},"ISSN":["1051-8215","1558-2205"],"issn-type":[{"type":"print","value":"1051-8215"},{"type":"electronic","value":"1558-2205"}],"subject":[],"published":{"date-parts":[[2025,6]]}}}