{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T09:59:46Z","timestamp":1777888786899,"version":"3.51.4"},"reference-count":48,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,19]],"date-time":"2025-10-19T00:00:00Z","timestamp":1760832000000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,19]]},"DOI":"10.1109\/iccv51701.2025.01691","type":"proceedings-article","created":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T19:45:49Z","timestamp":1777491949000},"page":"18197-18206","source":"Crossref","is-referenced-by-count":0,"title":["TokensGen: Harnessing Condensed Tokens for Long Video Generation"],"prefix":"10.1109","author":[{"given":"Wenqi","family":"Ouyang","sequence":"first","affiliation":[{"name":"Nanyang Technological University,S-Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zeqi","family":"Xiao","sequence":"additional","affiliation":[{"name":"Nanyang Technological University,S-Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Danni","family":"Yang","sequence":"additional","affiliation":[{"name":"SenseTime Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yifan","family":"Zhou","sequence":"additional","affiliation":[{"name":"Nanyang Technological University,S-Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuai","family":"Yang","sequence":"additional","affiliation":[{"name":"Wangxuan Institute of Computer Technology, Peking University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lei","family":"Yang","sequence":"additional","affiliation":[{"name":"SenseTime Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianlou","family":"Si","sequence":"additional","affiliation":[{"name":"SenseTime Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xingang","family":"Pan","sequence":"additional","affiliation":[{"name":"Nanyang Technological University,S-Lab"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref2","volume-title":"Hunyuanvideo: A systematic framework for large video generative models","year":"2024"},{"key":"ref3","volume-title":"sora","year":"2024"},{"key":"ref4","volume-title":"Gpt-4o. chatgpt.com","year":"2025"},{"key":"ref5","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","volume":"2311.15127","author":"Blattmann","year":"2023","journal-title":"arXiv preprint"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.52202\/068431-2303"},{"key":"ref7","article-title":"Ditctrl: Exploring attention control in multi-modal diffusion transformer for tuning-free multi-prompt longer video generation","author":"Cai","year":"2024","journal-title":"2412.18597"},{"key":"ref8","volume-title":"Pyscenedetect","author":"Castellano","year":"2024"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00698"},{"key":"ref10","article-title":"Seine: Short-to-long video diffusion model for generative transition and prediction","volume-title":"ICLR","author":"Chen","year":"2023"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.52202\/075280-0106"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2024\/1118"},{"issue":"11","key":"ref13","doi-asserted-by":"crossref","first-page":"559","DOI":"10.1080\/14786440109462720","article-title":"Liii. on lines and planes of closest fit to systems of points in space","volume":"2","author":"F.R.S.","year":"1901","journal-title":"The London, Edinburgh, and Dublin Philosophical Magazine and Journal of Science"},{"key":"ref14","article-title":"Videostudio: Generating consistent-content and multi-scene videos","volume-title":"ECCV","author":"Yao","year":"2024"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72946-1_19"},{"key":"ref16","article-title":"Animatediff: Animate your personalized text-toimage diffusion models without specific tuning","volume-title":"International Conference on Learning Representations","author":"Guo","year":"2024"},{"key":"ref17","author":"He","year":"2022","journal-title":"Latent video diffusion models for high-fidelity long video generation"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.00245"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref20","article-title":"Pyramidal flow matching for efficient video generative modeling","author":"Jin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref21","author":"Ju","year":"2024","journal-title":"Miradata: A large-scale video dataset with long durations and structured captions"},{"key":"ref22","article-title":"Fifo-diffusion: Generating infinite videos from text without training","author":"Kim","year":"2024","journal-title":"arXiv preprint"},{"key":"ref23","article-title":"Auto-encoding variational bayes","author":"Kingma","year":"2013","journal-title":"arXiv preprint"},{"key":"ref24","article-title":"Open-sora plan: Open-source large video generation model","author":"Lin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref25","author":"Lin","year":"2023","journal-title":"Videodirectorgpt: Consistent multi-scene video generation via llm-guided planning"},{"key":"ref26","article-title":"Freelong: Training-free long video generation with spectralblend temporal attention","author":"Lu","year":"2024","journal-title":"arXiv preprint"},{"key":"ref27","article-title":"Latte: Latent diffusion transformer for video generation","author":"Ma","year":"2024","journal-title":"arXiv preprint"},{"key":"ref28","article-title":"Tuning-free long video generation via global-local collaborative diffusion","author":"Ma","year":"2025","journal-title":"arXiv preprint"},{"key":"ref29","author":"Qiu","year":"2023","journal-title":"Freenoise: Tuning-free longer video diffusion via noise rescheduling"},{"key":"ref30","article-title":"Rolling diffusion models","author":"Ruhe","year":"2024","journal-title":"arXiv preprint"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"key":"ref32","article-title":"Video-infinity: Distributed long video generation","author":"Tan","year":"2024","journal-title":"arXiv preprint"},{"key":"ref33","volume-title":"Genmo Team. Mochi 1","year":"2024"},{"key":"ref34","article-title":"Videotetris: Towards compositional text-to-video generation","author":"Tian","year":"2024","journal-title":"arXiv preprint"},{"key":"ref35","article-title":"Gen-l-video: Multi-text to long video generation via temporal co-denoising","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref36","article-title":"Modelscope text-to-video technical report","author":"Wang","year":"2023","journal-title":"arXiv preprint"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-024-02295-1"},{"key":"ref38","article-title":"Loong: Generating minute-level long videos with autoregressive language models","author":"Wang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref39","article-title":"Progressive autoregressive video diffusion models","author":"Xie","year":"2024","journal-title":"arXiv preprint"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72952-2_23"},{"key":"ref41","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.73"},{"key":"ref43","article-title":"From slow bidirectional to fast causal video generators","author":"Yin","year":"2024","journal-title":"arXiv preprint"},{"key":"ref44","article-title":"Mora: Enabling generalist video generation via a multi-agent framework","author":"Yuan","year":"2024","journal-title":"arXiv preprint"},{"key":"ref45","volume-title":"I2vgen-xl: High-quality image-to-video synthesis via cascaded diffusion models","author":"Zhang","year":"2023"},{"key":"ref46","author":"Zhao","year":"2024","journal-title":"Moviedreamer: Hierarchical generation for coherent long visual sequence"},{"key":"ref47","author":"Zheng","year":"2024","journal-title":"Open-sora: Democratizing efficient video production for all"},{"key":"ref48","article-title":"Storydiffusion: Consistent self-attention for long-range image and video generation","author":"Zhou","year":"2024","journal-title":"NeurIPS 2024"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00841"}],"event":{"name":"2025 IEEE\/CVF International Conference on Computer Vision (ICCV)","location":"Honolulu, HI, USA","start":{"date-parts":[[2025,10,19]]},"end":{"date-parts":[[2025,10,25]]}},"container-title":["2025 IEEE\/CVF International Conference on Computer Vision (ICCV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11443115\/11443287\/11446063.pdf?arnumber=11446063","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T04:58:09Z","timestamp":1777611489000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11446063\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,19]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/iccv51701.2025.01691","relation":{},"subject":[],"published":{"date-parts":[[2025,10,19]]}}}