{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T01:09:11Z","timestamp":1769044151759,"version":"3.49.0"},"reference-count":25,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,10,22]],"date-time":"2025-10-22T00:00:00Z","timestamp":1761091200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,10,22]]},"DOI":"10.1109\/cbmi66578.2025.11339268","type":"proceedings-article","created":{"date-parts":[[2026,1,20]],"date-time":"2026-01-20T20:38:56Z","timestamp":1768941536000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["SeqBench: Benchmarking Sequential Narrative Generation in Text-to-Video Models"],"prefix":"10.1109","author":[{"given":"Zhengxu","family":"Tang","sequence":"first","affiliation":[{"name":"University of Michigan,Department of Electrical and Computer Engineering,Ann Arbor,United States"}]},{"given":"Zizheng","family":"Wang","sequence":"additional","affiliation":[{"name":"Northeastern University,Department of Mechanical and Industrial Engineering,Boston,United States"}]},{"given":"Luning","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Michigan,Department of Electrical and Computer Engineering,Ann Arbor,United States"}]},{"given":"Zitao","family":"Shuai","sequence":"additional","affiliation":[{"name":"University of Michigan,Department of Electrical and Computer Engineering,Ann Arbor,United States"}]},{"given":"Chenhao","family":"Zhang","sequence":"additional","affiliation":[{"name":"Paul G. 
Allen School of Computer Science and Engineering, University of Washington,Seattle,United States"}]},{"given":"Siyu","family":"Qian","sequence":"additional","affiliation":[{"name":"School of Engineering and Applied Sciences, Harvard University,Cambridge,United States"}]},{"given":"Yirui","family":"Wu","sequence":"additional","affiliation":[{"name":"School of Electronic and Information Engineering, Beijing Jiaotong University,Beijing,China"}]},{"given":"Bohao","family":"Wang","sequence":"additional","affiliation":[{"name":"College of Information Science and Electronic Engineering, Zhejiang University,Hangzhou,China"}]},{"given":"Haosong","family":"Rao","sequence":"additional","affiliation":[{"name":"Goergen Institute for Data Science, University of Rochester,Rochester,United States"}]},{"given":"Zhenyu","family":"Yang","sequence":"additional","affiliation":[{"name":"School of Earth Sciences, Zhejiang University,Hangzhou,China"}]},{"given":"Chenwei","family":"Wu","sequence":"additional","affiliation":[{"name":"Goergen Institute for Data Science, University of Rochester,Rochester,United States"}]}],"member":"263","reference":[{"key":"ref1","volume-title":"Video generation models as world simulators","author":"Brooks","year":"2024"},{"key":"ref2","volume-title":"From sora what we can see: A survey of text-to-video generation","author":"Sun","year":"2024"},{"key":"ref3","volume-title":"Evaluation of text-to-video generation models: A dynamics perspective","author":"Liao","year":"2024"},{"key":"ref4","volume-title":"Pika: Text-to-video generation platform","author":"Labs","year":"2024"},{"key":"ref5","volume-title":"Kling: Text-to-video generation model","author":"Technology","year":"2024"},{"key":"ref6","article-title":"Tc-bench: Benchmarking temporal compositionality in text-to-video and image-to-video generation","author":"Feng","year":"2024","journal-title":"arXiv preprint"},{"key":"ref7","article-title":"Towards world simulator: Crafting physical commonsense-based benchmark for video generation","author":"Meng","year":"2024","journal-title":"arXiv preprint"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.displa.2025.103178"},{"key":"ref9","article-title":"VBench++: Comprehensive and versatile benchmark suite for video generative models","author":"Huang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref10","author":"Zhang","year":"2024","journal-title":"Benchmarking aigc video quality assessment: A dataset and unified model"},{"key":"ref11","first-page":"63858","article-title":"T2vsafetybench: Evaluating the safety of text-to-video generative models","volume":"37","author":"Miao","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52734.2025.01272"},{"key":"ref13","article-title":"A survey on evaluation of multimodal large language models","author":"Huang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref14","article-title":"Exploring perceptual limitation of multimodal large language models","author":"Zhang","year":"2024","journal-title":"arXiv
preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipm.2023.103510"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/s44267-025-00099-6"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.wnu-1.4"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i12.33426"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/cvpr52733.2024.01289"},{"key":"ref20","volume-title":"Distilling transitional pattern to large language models for multimodal session-based recommendation","author":"Su","year":"2025"},{"key":"ref21","volume-title":"Introducing gen-3 alpha: A new frontier for video generation","year":"2025"},{"key":"ref22","volume-title":"Ray2: A new frontier in video generative models","author":"Labs","year":"2025"},{"key":"ref23","volume-title":"Veo 2: Advanced text-to-video generation model","year":"2025"},{"key":"ref24","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","author":"Yang","year":"2024","journal-title":"arXiv preprint"},{"key":"ref25","volume-title":"Hailuo ai: Text-to-video generation platform","year":"2025"}],"event":{"name":"2025 International Conference on Content-Based Multimedia Indexing (CBMI)","location":"Dublin, Ireland","start":{"date-parts":[[2025,10,22]]},"end":{"date-parts":[[2025,10,24]]}},"container-title":["2025 International Conference on Content-Based Multimedia Indexing (CBMI)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11339229\/11339242\/11339268.pdf?arnumber=11339268","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,21]],"date-time":"2026-01-21T07:35:20Z","timestamp":1768980920000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11339268\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,22]]},"references-count":25,"URL":"https:\/\/doi.org\/10.1109\/cbmi66578.2025.11339268","relation":{},"subject":[],"published":{"date-parts":[[2025,10,22]]}}}