{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T05:16:55Z","timestamp":1768281415469,"version":"3.49.0"},"reference-count":69,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62021001"],"award-info":[{"award-number":["62021001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100012226","name":"Fundamental Research Funds for the Central Universities","doi-asserted-by":"publisher","award":["WK3490000007"],"award-info":[{"award-number":["WK3490000007"]}],"id":[{"id":"10.13039\/501100012226","id-type":"DOI","asserted-by":"publisher"}]},{"name":"GPU Cluster built by MCC Lab of Information Science and Technology Institution of USTC"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. 
Multimedia"],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tmm.2025.3618540","type":"journal-article","created":{"date-parts":[[2025,10,6]],"date-time":"2025-10-06T17:37:36Z","timestamp":1759772256000},"page":"42-56","source":"Crossref","is-referenced-by-count":0,"title":["Tuning-Free High-Resolution Video Diffusion With Spatial-Temporal Latent Grouping"],"prefix":"10.1109","volume":"28","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-5398-2026","authenticated-orcid":false,"given":"Zhikai","family":"Chen","sequence":"first","affiliation":[{"name":"MoE Key Laboratory of Brain-inspired Intelligent Perception and Cognition, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0818-0985","authenticated-orcid":false,"given":"Fuchen","family":"Long","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7485-9198","authenticated-orcid":false,"given":"Zhaofan","family":"Qiu","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7587-101X","authenticated-orcid":false,"given":"Ting","family":"Yao","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1690-9836","authenticated-orcid":false,"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[{"name":"MoE Key Laboratory of Brain-inspired Intelligent Perception and Cognition, University of Science and Technology of China, Hefei, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4516-9729","authenticated-orcid":false,"given":"Jiebo","family":"Luo","sequence":"additional","affiliation":[{"name":"University of Rochester, Rochester, NY, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2497-7732","authenticated-orcid":false,"given":"Tao","family":"Mei","sequence":"additional","affiliation":[{"name":"HiDream.ai Inc., Beijing, 
China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"AnimateDiff: Animate your personalized text-to-image diffusion models without specific tuning","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Guo","year":"2024"},{"key":"ref2","first-page":"8780","article-title":"Diffusion models beat GANs on image synthesis","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Dhariwal","year":"2021"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00355"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3528233.3530757"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/tmm.2023.3284989"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3474085.3475421"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00984"},{"key":"ref10","first-page":"7594","article-title":"VideoComposer: Compositional video synthesis with motion controllability","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Wang","year":"2024"},{"key":"ref11","article-title":"VideoCrafter1: Open diffusion models for high-quality video generation","author":"Chen","year":"2023"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00882"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00828"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-73027-6_27"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3516874"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2024.3476683"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2022.3203421"},{"key":"ref18","article-title":"ScaleCrafter: Tuning-free higher-resolution visual generation with diffusion models","volume-title":"Proc. Int. 
Conf. Learn. Representations","author":"He","year":"2023"},{"key":"ref19","article-title":"FreeNoise: Tuning-free longer video diffusion via noise rescheduling","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Qiu","year":"2024"},{"key":"ref20","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Radford","year":"2021"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02090"},{"key":"ref22","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ho","year":"2020"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3381814"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01305"},{"key":"ref25","first-page":"16784","article-title":"Glide: Towards photorealistic image generation and editing with text-guided diffusion models","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Nichol","year":"2022"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3359769"},{"key":"ref27","first-page":"1","article-title":"Cascaded diffusion models for high fidelity image generation","volume":"23","author":"Ho","year":"2021","journal-title":"J. Mach. Learn. Res."},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i7.28589"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00589"},{"key":"ref30","first-page":"1737","article-title":"MultiDiffusion: Fusing diffusion paths for controlled image generation","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Bar-Tal","year":"2023"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72983-6_9"},{"key":"ref32","first-page":"70847","article-title":"Training-free diffusion model adaptation for variable-sized text-to-image synthesis","volume-title":"Proc. Adv. 
Neural Inf. Process. Syst.","author":"Jin","year":"2023"},{"key":"ref33","first-page":"8633","article-title":"Video diffusion models","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Ho","year":"2022"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1145\/3581783.3612405"},{"key":"ref35","article-title":"Imagen video: High definition video generation with diffusion models","author":"Ho","year":"2022"},{"key":"ref36","article-title":"Make-a-video: Text-to-video generation without text-video data","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Singer","year":"2023"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-025-02349-y"},{"key":"ref38","first-page":"23371","article-title":"MCVD: Masked conditional video diffusion for prediction, generation, and interpolation","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Voleti","year":"2022"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02096"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00804"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2024.3362149"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2023.3334019"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-19833-5_28"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00319"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00043"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-023-01779-w"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00701"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01462"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72646-0_22"},{"key":"ref51","article-title":"UniCtrl: Improving the spatiotemporal consistency of 
text-to-video diffusion models via training-free unified attention control","author":"Chen","year":"2024","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref52","article-title":"Reuse and diffuse: Iterative denoising for text-to-video generation","author":"Gu","year":"2023"},{"key":"ref53","article-title":"Progressive growing of GANs for improved quality, stability, and variation","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Karras","year":"2018"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_24"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2007.4408903"},{"key":"ref56","article-title":"Gen-L-Video: Multi-text to long video generation via temporal co-denoising","author":"Wang","year":"2023"},{"key":"ref57","article-title":"Diffusers: State-of-the-art diffusion models","author":"Platen","year":"2022","journal-title":"GitHub Repository"},{"key":"ref58","article-title":"SDXL: Improving latent diffusion models for high-resolution image synthesis","volume-title":"Proc. Int. Conf. Learn. Representations","author":"Podell","year":"2024"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i2.25353"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00510"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01843"},{"key":"ref62","article-title":"FVD: A new metric for video generation","volume-title":"Proc. Int. Conf. Learn. Representations Workshop","author":"Unterthiner","year":"2019"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00491"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00245"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20071-7_15"},{"key":"ref66","article-title":"Classifier-free diffusion guidance","volume-title":"Proc. Adv. Neural Inf. Process. Syst. 
Workshop","author":"Ho","year":"2021"},{"key":"ref67","article-title":"Tuning-free visual customization via view iterative self-attention control","author":"Li","year":"2024"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.00209"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6046\/11342315\/11194261.pdf?arnumber=11194261","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,12]],"date-time":"2026-01-12T22:02:26Z","timestamp":1768255346000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11194261\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":69,"URL":"https:\/\/doi.org\/10.1109\/tmm.2025.3618540","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}