{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T06:11:03Z","timestamp":1758089463539,"version":"3.44.0"},"reference-count":32,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T00:00:00Z","timestamp":1750550400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T00:00:00Z","timestamp":1750550400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013105","name":"Shanghai Rising-Star Program","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100013105","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,6,22]]},"DOI":"10.1109\/dac63849.2025.11132992","type":"proceedings-article","created":{"date-parts":[[2025,9,15]],"date-time":"2025-09-15T17:35:41Z","timestamp":1757957741000},"page":"1-7","source":"Crossref","is-referenced-by-count":0,"title":["Harnessing Conventional Video Processing Insights for Emerging 3D Video Generation Models: A Comprehensive Attention-aware Way"],"prefix":"10.1109","author":[{"given":"Tianlang","family":"Zhao","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Jun","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Xingyang","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Li","family":"Ding","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Jinhao","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Shuaiheng","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Jinbo","family":"Hu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]},{"given":"Guohao","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}]}],"member":"263","reference":[{"article-title":"Video generation models as world simulators","year":"2024","author":"Brooks","key":"ref1"},{"key":"ref2","article-title":"Stable video diffusion: Scaling latent video diffusion models to large datasets","author":"Blattmann","year":"2023","journal-title":"arXiv preprint arXiv:2311.15127"},{"key":"ref3","article-title":"Cogvideo: Largescale pretraining for text-to-video generation via transformers","author":"Hong","year":"2022","journal-title":"arXiv preprint arXiv:2205.15868"},{"article-title":"How far is video generation from world model: A physical law perspective","year":"2024","author":"Kang","key":"ref4"},{"key":"ref5","article-title":"Video as the new language for realworld decision making","author":"Yang","year":"2024","journal-title":"arXiv preprint arXiv:2402.17139"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"article-title":"Open-sora: Democratizing efficient video production for all","year":"2024","author":"Zheng","key":"ref7"},{"key":"ref8","article-title":"Latte: Latent diffusion transformer for video generation","author":"Ma","year":"2024","journal-title":"arXiv preprint arXiv:2401.03048"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01770"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-72943-0_9"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687614"},{"key":"ref12","article-title":"Cogvideox: Text-to-video diffusion models with an expert transformer","author":"Yang","year":"2024","journal-title":"arXiv preprint arXiv:2408.06072"},{"key":"ref13","article-title":"Lumina-t2x: Transforming text into any modality, resolution, and duration via flow-based large diffusion transformers","author":"Gao","year":"2024","journal-title":"arXiv preprint arXiv:2405.05945"},{"article-title":"Open-sora-plan","year":"2024","author":"Lab","key":"ref14"},{"key":"ref15","article-title":"Easyanimate: A high-performance long video generation method based on transformer architecture","author":"Xu","year":"2024","journal-title":"arXiv preprint arXiv:2405.18991"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2003.815165"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2012.2221191"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1117\/12.2188913"},{"article-title":"Discrete cosine transform-Wikipedia, the free encyclopedia","year":"2024","author":"contributors","key":"ref19"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480125"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589057"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3649329.3656237"},{"key":"ref23","article-title":"Real-time video generation with pyramid attention broadcast","author":"Zhao","year":"2024","journal-title":"arXiv preprint arXiv:2408.12588"},{"key":"ref24","article-title":"Fora: Fastforward caching in diffusion transformer acceleration","author":"Selvaraju","year":"2024","journal-title":"arXiv preprint arXiv:2407.01425"},{"key":"ref25","first-page":"6840","article-title":"Denoising diffusion probabilistic models","volume":"33","author":"Ho","year":"2020","journal-title":"Advances in neural information processing systems"},{"article-title":"Hewlettpackard\/cacti","year":"2023","author":"Naveen Muralimanohar","key":"ref26"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"ref28","article-title":"Godiva: Generating open-domain videos from natural descriptions","author":"Wu","year":"2021","journal-title":"arXiv preprint arXiv:2104.14806"},{"article-title":"NVIDIA A100 Tensor Core GPU Architecture","year":"2020","author":"Corporation","key":"ref29"},{"key":"ref30","article-title":"Python time library"},{"article-title":"Cuda event api","year":"2024","author":"NVIDIA","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640393"}],"event":{"name":"2025 62nd ACM\/IEEE Design Automation Conference (DAC)","start":{"date-parts":[[2025,6,22]]},"location":"San Francisco, CA, USA","end":{"date-parts":[[2025,6,25]]}},"container-title":["2025 62nd ACM\/IEEE Design Automation Conference (DAC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11132383\/11132091\/11132992.pdf?arnumber=11132992","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,16]],"date-time":"2025-09-16T05:24:47Z","timestamp":1758000287000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11132992\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,6,22]]},"references-count":32,"URL":"https:\/\/doi.org\/10.1109\/dac63849.2025.11132992","relation":{},"subject":[],"published":{"date-parts":[[2025,6,22]]}}}