{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T07:12:13Z","timestamp":1778051533444,"version":"3.51.4"},"reference-count":99,"publisher":"IEEE","license":[{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,3,6]],"date-time":"2026-03-06T00:00:00Z","timestamp":1772755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026,3,6]]},"DOI":"10.1109\/wacv61042.2026.00626","type":"proceedings-article","created":{"date-parts":[[2026,5,5]],"date-time":"2026-05-05T19:59:32Z","timestamp":1778011172000},"page":"6474-6485","source":"Crossref","is-referenced-by-count":0,"title":["T2VWorldBench: A Benchmark for Evaluating World Knowledge in Text-to-Video Generation"],"prefix":"10.1109","author":[{"given":"Yubin","family":"Chen","sequence":"first","affiliation":[{"name":"San Jose State University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuyang","family":"Guo","sequence":"additional","affiliation":[{"name":"Guilin University of Electronic Technology"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenmei","family":"Shi","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhao","family":"Song","sequence":"additional","affiliation":[{"name":"University of California,Berkeley"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiahao","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Washington"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","article-title":"Pixverse v4.5","year":"2025"},{"key":"ref2","article-title":"Wan: Open and advanced large-scale video generative models","year":"2025"},{"key":"ref3","article-title":"How to capture higher-order correlations? generalizing matrix softmax attention to kronecker computation","volume-title":"The Twelfth International Conference on Learning Representations","author":"Alman"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1963"},{"key":"ref5","article-title":"Fast rope attention: Combining the polynomial method and fast fourier transform","author":"Alman","year":"2025"},{"key":"ref6","article-title":"Only large weights (and not skip connections) can prevent the perils of rank collapse","author":"Alman","year":"2025"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.02161"},{"key":"ref8","article-title":"Unleash the power of ai image generator","year":"2024"},{"key":"ref9","article-title":"Teaching video diffusion model with latent physical phenomenon knowledge","author":"Cao","year":"2024"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3746252.3761227"},{"key":"ref11","article-title":"Towards high-order mean flow generative models: Feasibility, expressivity, and provably efficient criteria","author":"Cao","year":"2025"},{"key":"ref12","article-title":"Richspace: Enriching text-to-video prompt space via text embedding interpolation","author":"Cao","year":"2025"},{"key":"ref13","article-title":"Text-to-image diffusion models cannot count, and prompt refinement cannot help","author":"Cao","year":"2025"},{"key":"ref14","article-title":"Video latent flow matching: Optimal polynomial projections for video interpolation and extrapolation","author":"Cao","year":"2025"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.02121"},{"key":"ref16","article-title":"High-order matching for one-step shortcut diffusion models","author":"Chen","year":"2025"},{"key":"ref17","article-title":"Nrflow: Towards noise-robust generative modeling via high-order mechanism","volume-title":"Proceedings of the 41st Conference on Uncertainty in Artificial Intelligence","author":"Chen"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2025.emnlp-main.561"},{"key":"ref19","article-title":"Provable failure of language models in learning majority boolean logic via gradient descent","author":"Chen","year":"2025"},{"key":"ref20","article-title":"Videocrafter1: Open diffusion models for high-quality video generation","author":"Chen","year":"2023"},{"key":"ref21","article-title":"Fundamental limits of visual autoregressive transformers: Universal approximation abilities","volume-title":"International Conference on Machine Learning","author":"Chen"},{"key":"ref22","article-title":"Bridging the intent gap: Knowledge-enhanced visual generation","author":"Cheng","year":"2024"},{"key":"ref23","article-title":"Tc-bench: Benchmarking temporal compositionality in text-to-video and image-to-video generation","author":"Feng","year":"2024"},{"key":"ref24","article-title":"Mochi 1","year":"2024"},{"key":"ref25","article-title":"On computational limits of flowar models: Expressivity and efficiency","author":"Gong","year":"2025"},{"key":"ref26","article-title":"Theoretical guarantees for high order trajectory refinement in generative flows","author":"Gong","year":"2025"},{"key":"ref27","article-title":"Generative adversarial nets","volume":"27","author":"Goodfellow","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01978"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-025-09422-z"},{"key":"ref30","article-title":"I2v-adapter: A general image-to-video adapter for video diffusion models","author":"Guo","year":"2023","journal-title":"CoRR"},{"key":"ref31","article-title":"Can you count to nine? a human evaluation benchmark for counting limits in modern text-to-video models","author":"Guo","year":"2025"},{"key":"ref32","article-title":"Your vision-language model can\u2019t even count to 20: Exposing the failures of vlms in compositional counting","author":"Guo","year":"2025"},{"key":"ref33","article-title":"T2vphysbench: A first-principles benchmark for physical consistency in text-to-video generation","author":"Guo","year":"2025"},{"key":"ref34","article-title":"T2vtextbench: A human evaluation benchmark for textual control in video generation models","author":"Guo","year":"2025"},{"key":"ref35","article-title":"Ltx-video: Realtime video latent diffusion","author":"HaCohen","year":"2024"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"ref37","article-title":"Gans trained by a two time-scale update rule converge to a local nash equilibrium","volume":"30","author":"Heusel","year":"2017","journal-title":"Advances in neural information processing systems"},{"key":"ref38","article-title":"beta-vae: Learning basic visual concepts with a constrained variational framework","volume-title":"International conference on learning representations","author":"Higgins"},{"key":"ref39","article-title":"Cogvideo: Large-scale pretraining for text-to-video generation via transformers","volume-title":"The Eleventh International Conference on Learning Representations","author":"Hong"},{"key":"ref40","article-title":"On computational limits of modern hopfield models: A fine-grained complexity analysis","volume-title":"Forty-first International Conference on Machine Learning","author":"Hu"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.52202\/079017-0992"},{"key":"ref42","article-title":"Fundamental limits of prompt tuning transformers: Universality, capacity and efficiency","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Hu"},{"key":"ref43","article-title":"On statistical rates of conditional diffusion transformers: Approximation, estimation and minimax optimality","volume-title":"The Thirteenth International Conference on Learning Representations","author":"Hu"},{"key":"ref44","first-page":"17603","article-title":"Video-mage: Multi-subject and motion customization of text-to-video diffusion models","volume-title":"Proceedings of the Computer Vision and Pattern Recognition Conference","author":"Huang"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00541"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00453"},{"key":"ref49","article-title":"On computational limits and provably efficient criteria of visual autoregressive models: A fine-grained complexity analysis","author":"Ke","year":"2025"},{"key":"ref50","article-title":"Auto-encoding variational bayes","volume-title":"ICLR","author":"Kingma"},{"key":"ref51","article-title":"Kling video model","year":"2024"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6795"},{"key":"ref53","article-title":"Hofar: High-order augmentation of flow autoregressive transformers","author":"Liang","year":"2025"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01064"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3483"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02484"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2723"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02090"},{"key":"ref59","article-title":"Theoretical foundation of flow-based time series generation: Provable approximation, generalization, and efficiency","author":"Long","year":"2025"},{"key":"ref60","article-title":"Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","volume":"32","author":"Lu","year":"2019","journal-title":"Advances in neural information processing systems"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00984"},{"key":"ref62","article-title":"Towards world simulator: Crafting physical commonsense-based benchmark for video generation","author":"Meng","year":"2024"},{"key":"ref63","article-title":"Phybench: A physical commonsense benchmark for evaluating text-to-image models","author":"Meng","year":"2024"},{"key":"ref64","article-title":"Hailuo ai advances cinematic storytelling with t2v-01-director and i2v-01-director","year":"2025"},{"key":"ref65","article-title":"Openvid-1m: A large-scale high-quality dataset for text-to-video generation","volume-title":"ICLR","author":"Nan"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72691-0_4"},{"key":"ref67","article-title":"Wise: A world knowledge-informed semantic evaluation for text-to-image generation","author":"Niu","year":"2025"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72775-7_23"},{"key":"ref69","article-title":"Sora system card","year":"2024"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02199"},{"key":"ref71","article-title":"Pika labs 2.2: The future of ai-driven video generation","year":"2024"},{"key":"ref72","article-title":"Unsupervised representation learning with deep convolutional generative adversarial networks","volume-title":"ICLR","author":"Radford"},{"key":"ref73","first-page":"8748","article-title":"Learning transferable visual models from natural language supervision","volume-title":"International conference on machine learning","author":"Radford"},{"key":"ref74","first-page":"1278","article-title":"Stochastic backpropagation and approximate inference in deep generative models","volume-title":"International conference on machine learning","author":"Rezende"},{"key":"ref75","article-title":"Improved techniques for training gans","volume":"29","author":"Salimans","year":"2016","journal-title":"Advances in neural information processing systems"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i19.34248"},{"key":"ref77","article-title":"Make-a-video: Text-to-video generation without text-video data","volume-title":"The Eleventh International Conference on Learning Representations","author":"Singer"},{"key":"ref78","article-title":"Videoagent: Self-improving video generation","author":"Soni","year":"2024"},{"key":"ref79","article-title":"High-order flow matching: Unified framework and sharp statistical rates","volume-title":"The Thirty-ninth Annual Conference on Neural Information Processing Systems","author":"Su"},{"key":"ref80","article-title":"A theoretical analysis of discrete flow matching generative models","author":"Su","year":"2025"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00787"},{"key":"ref82","article-title":"Fvd: A new metric for video generation","volume-title":"ICLR","author":"Unterthiner"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1145\/3721238.3730755"},{"key":"ref84","article-title":"Respond beyond language: A benchmark for video generation in response to realistic user intents","author":"Wang","year":"2025"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2096"},{"key":"ref86","article-title":"Wan: Open and advanced large-scale video generative models","year":"2025"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01754"},{"key":"ref88","article-title":"Video as the new language for real-world decision making","author":"Yang","year":"2024"},{"key":"ref89","article-title":"Videograin: Modulating space-time attention for multi-grained video editing","author":"Yang","year":"2025"},{"key":"ref90","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","author":"Yang","year":"2024"},{"key":"ref91","article-title":"Dragnuwa: Fine-grained control in video generation by integrating text, image, and trajectory","author":"Yin","year":"2023"},{"key":"ref92","first-page":"21236","article-title":"Chronomagic-bench: A benchmark for metamorphic evaluation of text-to-time-lapse video generation","volume-title":"Advances in Neural Information Processing Systems","author":"Yuan","year":"2024"},{"key":"ref93","article-title":"Text-to-image diffusion models in generative ai: A survey","author":"Zhang","year":"2023"},{"key":"ref94","article-title":"Worldgenbench: A world-knowledge-integrated benchmark for reasoning-driven text-to-image generation","author":"Zhang","year":"2025"},{"key":"ref95","first-page":"1","article-title":"Show-1: Marrying pixel and latent diffusion models for text-to-video generation","author":"Zhang","year":"2024","journal-title":"International Journal of Computer Vision"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.52202\/079017-3093"},{"key":"ref97","article-title":"Vbench-2.0: Advancing video generation benchmark suite for intrinsic faithfulness","author":"Zheng","year":"2025"},{"key":"ref98","article-title":"Cogvideox + cogsound","year":"2024"},{"key":"ref99","first-page":"2165","article-title":"Rt-2: Vision-language-action models transfer web knowledge to robotic control","volume-title":"Conference on Robot Learning","author":"Zitkovich"}],"event":{"name":"2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)","location":"Tucson, AZ, USA","start":{"date-parts":[[2026,3,6]]},"end":{"date-parts":[[2026,3,10]]}},"container-title":["2026 IEEE\/CVF Winter Conference on Applications of Computer Vision (WACV)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11491838\/11491925\/11492176.pdf?arnumber=11492176","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T06:16:34Z","timestamp":1778048194000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11492176\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,3,6]]},"references-count":99,"URL":"https:\/\/doi.org\/10.1109\/wacv61042.2026.00626","relation":{},"subject":[],"published":{"date-parts":[[2026,3,6]]}}}