{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,15]],"date-time":"2026-05-15T00:49:48Z","timestamp":1778806188952,"version":"3.51.4"},"reference-count":83,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"9","license":[{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,1]],"date-time":"2025-09-01T00:00:00Z","timestamp":1756684800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62202014"],"award-info":[{"award-number":["62202014"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62332002"],"award-info":[{"award-number":["62332002"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62425101"],"award-info":[{"award-number":["62425101"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62088102"],"award-info":[{"award-number":["62088102"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Open research fund of Pengcheng Laboratory","award":["2024KF1A0020"],"award-info":[{"award-number":["2024KF1A0020"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2025,9]]},"DOI":"10.1109\/tpami.2025.3558507","type":"journal-article","created":{"date-parts":[[2025,4,8]],"date-time":"2025-04-08T13:39:46Z","timestamp":1744119586000},"page":"7340-7351","source":"Crossref","is-referenced-by-count":11,"title":["<i>MagicTime:<\/i> Time-Lapse Video Generation Models as Metamorphic Simulators"],"prefix":"10.1109","volume":"47","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-4850-2278","authenticated-orcid":false,"given":"Shenghai","family":"Yuan","sequence":"first","affiliation":[{"name":"Shenzhen Graduate School, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0081-4106","authenticated-orcid":false,"given":"Jinfa","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-7594-7616","authenticated-orcid":false,"given":"Yujun","family":"Shi","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-1635-6130","authenticated-orcid":false,"given":"Yongqi","family":"Xu","sequence":"additional","affiliation":[{"name":"Shenzhen Graduate School, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4864-8474","authenticated-orcid":false,"given":"Ruijie","family":"Zhu","sequence":"additional","affiliation":[{"name":"University of California, Santa Cruz, Santa Cruz, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4805-9730","authenticated-orcid":false,"given":"Bin","family":"Lin","sequence":"additional","affiliation":[{"name":"Shenzhen Graduate School, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9034-279X","authenticated-orcid":false,"given":"Xinhua","family":"Cheng","sequence":"additional","affiliation":[{"name":"Shenzhen Graduate School, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2120-5588","authenticated-orcid":false,"given":"Li","family":"Yuan","sequence":"additional","affiliation":[{"name":"Shenzhen Graduate School, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4516-9729","authenticated-orcid":false,"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[{"name":"Department of Computer Science, University of Rochester, Rochester, NY, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"AnimateDiff: Animate your personalized text-to-image diffusion models without specific tuning","author":"Guo","journal-title":"arXiv:2307.04725"},{"key":"ref2","first-page":"25105","article-title":"VideoPoet: A large language model for zero-shot video generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Kondratyuk"},{"key":"ref3","article-title":"Latte: Latent diffusion transformer for video generation","author":"Ma","year":"2024"},{"key":"ref4","first-page":"7594","article-title":"VideoComposer: Compositional video synthesis with motion controllability","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref5","article-title":"Modelscope text-to-video technical report","author":"Wang","year":"2023"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00251"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475421"},{"key":"ref9","first-page":"26135","article-title":"Free-bloom: Zero-shot text-to-video generator with LLM director and LDM animator","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Huang"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref11","first-page":"25278","article-title":"LAION-5B: An open large-scale dataset for training next generation image-text models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Schuhmann"},{"key":"ref12","article-title":"Auto-encoding variational bayes","author":"Kingma","year":"2013"},{"key":"ref13","first-page":"6309","article-title":"Neural discrete representation learning","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Van Den Oord"},{"key":"ref14","first-page":"1530","article-title":"Variational inference with normalizing flows","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Rezende"},{"key":"ref15","article-title":"Density estimation using real NVP","author":"Dinh","journal-title":"arXiv:1605.08803"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00229"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530738"},{"key":"ref18","first-page":"30105","article-title":"StyleGAN-T: Unlocking the power of GANs for fast large-scale text-to-image synthesis","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Sauer"},{"key":"ref19","first-page":"36479","article-title":"Photorealistic text-to-image diffusion models with deep language understanding","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Saharia"},{"key":"ref20","first-page":"8821","article-title":"Zero-shot text-to-image generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Ramesh"},{"key":"ref21","article-title":"Hierarchical text-conditional image generation with CLIP latents","author":"Ramesh","year":"2022"},{"key":"ref22","article-title":"SDXL: Improving latent diffusion models for high-resolution image synthesis","author":"Podell","journal-title":"arXiv:2307.01952"},{"key":"ref23","article-title":"LLM-grounded diffusion: Enhancing prompt understanding of text-to-image diffusion models with large language models","author":"Lian","year":"2023","journal-title":"arXiv:2305.13655"},{"key":"ref24","first-page":"1737","article-title":"MultiDiffusion: Fusing diffusion paths for controlled image generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Bar-Tal"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/WACV57701.2024.00526"},{"key":"ref27","first-page":"11127","article-title":"Uni-ControlNet: All-in-one control to text-to-image diffusion models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Zhao"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i5.28226"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00844"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02156"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00175"},{"key":"ref32","article-title":"InternVid: A large-scale video-text dataset for multimodal understanding and generation","author":"Wang","journal-title":"arXiv:2307.06942"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02349-y"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3355414"},{"key":"ref35","article-title":"Make-a-video: Text-to-video generation without text-video data","author":"Singer","journal-title":"arXiv:2209.14792"},{"key":"ref36","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02271-9"},{"key":"ref39","article-title":"MagicVideo-v2: Multi-stage high-aesthetic video generation","author":"Wang","year":"2024"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00150"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_18"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3355089.3356523"},{"key":"ref44","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Ho"},{"key":"ref45","article-title":"Denoising diffusion implicit models","author":"Song","journal-title":"arXiv:2010.02502"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3486179"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3475249"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/tpami.2024.3463875"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3435448"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3399098"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3382753"},{"key":"ref52","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.342"},{"key":"ref54","article-title":"PLLaVA: Parameter-free LLaVA extension from images to videos for video dense captioning","author":"Xu","year":"2024"},{"key":"ref55","article-title":"LLaVA-next: Stronger LLMs supercharge multimodal capabilities in the wild","author":"Li","year":"2024"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2024.3448248"},{"key":"ref57","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Brown"},{"key":"ref58","article-title":"GPT-4 technical report","author":"Achiam","year":"2023"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/3709005"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00660"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-92808-6_2"},{"key":"ref62","article-title":"LanguageBind: Extending video-language pretraining to n-modality by language-based semantic alignment","author":"Zhu","journal-title":"arXiv:2310.01852"},{"key":"ref63","first-page":"10088","article-title":"QLORA: Efficient finetuning of quantized LLMs","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Dettmers"},{"key":"ref64","article-title":"MakeLongVideo","volume-title":"GitHub","author":"Duo","year":"2023"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref66","article-title":"zeroscope","volume-title":"Huggingface","author":"Sterling","year":"2023"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02295-1"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"ref69","first-page":"6626","article-title":"GANs trained by a two time-scale update rule converge to a local Nash equilibrium","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Heusel"},{"key":"ref70","article-title":"FVD: A new metric for video generation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Unterthiner"},{"key":"ref71","article-title":"GODIVA: Generating open-domain videos from natural descriptions","author":"Wu","year":"2021"},{"key":"ref72","first-page":"5288","article-title":"MSR-VTT: A large video description dataset for bridging video and language","volume-title":"Proc. IEEE\/CVF Conf. Comput. Vis. Pattern Recognit.","author":"Jun"},{"key":"ref73","article-title":"Video generation models as world simulators","volume-title":"OpenAI","author":"Brooks","year":"2024"},{"key":"ref74","article-title":"Open-SoRa: Democratizing efficient video production for all","author":"Zheng","year":"2024"},{"key":"ref75","article-title":"Open-SoRa plan: Open-source large video generation model","author":"Lin","year":"2024"},{"key":"ref76","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","author":"Yang","journal-title":"arXiv:2408.06072"},{"key":"ref77","article-title":"LTX-video: Realtime video latent diffusion","author":"HaCohen","year":"2024"},{"key":"ref78","article-title":"Mochi 1","year":"2024"},{"key":"ref79","article-title":"SUGAR: Subject-driven video customization in a zero-shot manner","author":"Zhou","year":"2024"},{"key":"ref80","first-page":"19730","article-title":"BLIP-2: Bootstrapping language-image pre-training with frozen image encoders and large language models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Li"},{"key":"ref81","first-page":"23318","article-title":"OFA: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Wang"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00945"},{"key":"ref83","article-title":"FreeNoise: Tuning-free longer video diffusion via noise rescheduling","author":"Qiu","journal-title":"arXiv:2310.15169"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11118328\/10955140.pdf?arnumber=10955140","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,7]],"date-time":"2025-08-07T17:44:33Z","timestamp":1754588673000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10955140\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9]]},"references-count":83,"journal-issue":{"issue":"9"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3558507","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,9]]}}}