{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T16:59:40Z","timestamp":1777568380560,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":57,"publisher":"ACM","funder":[{"name":"National Natural Science Foundation of China"},{"name":"Tsinghua University Initiative Scientific Research Program"},{"name":"Tsinghua-Efort Joint Research Center for EAI Computation and Perception"},{"name":"Beijing National Research Center for Information Science, Technology (BNRist)"},{"name":"Beijing Innovation Center for Future Chips, and State Key laboratory of Space Network and Communications"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,10,27]]},"DOI":"10.1145\/3746027.3755638","type":"proceedings-article","created":{"date-parts":[[2025,10,25]],"date-time":"2025-10-25T07:27:39Z","timestamp":1761377259000},"page":"10388-10397","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["DLFR-VAE: Dynamic Latent Frame Rate VAE for Video Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7846-0240","authenticated-orcid":false,"given":"Zhihang","family":"Yuan","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China and Infinigence AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0934-1572","authenticated-orcid":false,"given":"Siyuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2286-6668","authenticated-orcid":false,"given":"Yuzhang","family":"Shang","sequence":"additional","affiliation":[{"name":"University of Central Florida, Orlando, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7388-8145","authenticated-orcid":false,"given":"Hanling","family":"Zhang","sequence":"additional","affiliation":[{"name":"Infinigence AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7279-8359","authenticated-orcid":false,"given":"Tongcheng","family":"Fang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-1318-0924","authenticated-orcid":false,"given":"Rui","family":"Xie","sequence":"additional","affiliation":[{"name":"Infinigence AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-3858-7972","authenticated-orcid":false,"given":"Shengen","family":"Yan","sequence":"additional","affiliation":[{"name":"Infinigence AI, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-7000-6537","authenticated-orcid":false,"given":"Guohao","family":"Dai","sequence":"additional","affiliation":[{"name":"Shanghai Jiaotong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6108-5157","authenticated-orcid":false,"given":"Yu","family":"Wang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,10,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Niket Agarwal Arslan Ali Maciej Bala Yogesh Balaji Erik Barker Tiffany Cai Prithvijit Chattopadhyay Yongxin Chen Yin Cui Yifan Ding et al. 2025. Cosmos world foundation model platform for physical ai. arXiv preprint arXiv:2501.03575 (2025)."},{"key":"e_1_3_2_1_2_1","volume-title":"Information theory","author":"Ash Robert B","unstructured":"Robert B Ash. 2012. Information theory. Courier Corporation."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1111\/roiw.12510"},{"key":"e_1_3_2_1_4_1","unstructured":"Tim Brooks Bill Peebles Connor Holmes Will DePue Yufei Guo Li Jing David Schnurr Joe Taylor Troy Luhman Eric Luhman Clarence Ng Ricky Wang and Aditya Ramesh. 2024. Video generation models as world simulators. (2024). https:\/\/openai.com\/research"},{"key":"e_1_3_2_1_5_1","volume-title":"Deep compression autoencoder for efficient high-resolution diffusion models. arXiv preprint arXiv:2410.10733","author":"Chen Junyu","year":"2024","unstructured":"Junyu Chen, Han Cai, Junsong Chen, Enze Xie, Shang Yang, Haotian Tang, Muyang Li, Yao Lu, and Song Han. 2024. Deep compression autoencoder for efficient high-resolution diffusion models. arXiv preprint arXiv:2410.10733 (2024)."},{"key":"e_1_3_2_1_6_1","volume-title":"Extending context window of large language models via positional interpolation. arXiv preprint arXiv:2306.15595","author":"Chen Shouyuan","year":"2023","unstructured":"Shouyuan Chen, Sherman Wong, Liangjian Chen, and Yuandong Tian. 2023. Extending context window of large language models via positional interpolation. arXiv preprint arXiv:2306.15595 (2023)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00675"},{"key":"e_1_3_2_1_8_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, et al., 2024. Scaling rectified flow transformers for high-resolution image synthesis. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_2_1_10_1","volume-title":"Fluid: Scaling autoregressive text-to-image generative models with continuous tokens. arXiv preprint arXiv:2410.13863","author":"Fan Lijie","year":"2024","unstructured":"Lijie Fan, Tianhong Li, Siyang Qin, Yuanzhen Li, Chen Sun, Michael Rubinstein, Deqing Sun, Kaiming He, and Yonglong Tian. 2024. Fluid: Scaling autoregressive text-to-image generative models with continuous tokens. arXiv preprint arXiv:2410.13863 (2024)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00630"},{"key":"e_1_3_2_1_12_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. Advances in neural information processing systems, Vol. 33 (2020), 6840-6851."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_2_1_14_1","volume-title":"Pyramidal flow matching for efficient video generative modeling. arXiv preprint arXiv:2410.05954","author":"Jin Yang","year":"2024","unstructured":"Yang Jin, Zhicheng Sun, Ningyuan Li, Kun Xu, Hao Jiang, Nan Zhuang, Quzhe Huang, Yang Song, Yadong Mu, and Zhouchen Lin. 2024. Pyramidal flow matching for efficient video generative modeling. arXiv preprint arXiv:2410.05954 (2024)."},{"key":"e_1_3_2_1_15_1","volume-title":"Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114","author":"Kingma Diederik P","year":"2013","unstructured":"Diederik P Kingma. 2013. Auto-encoding variational bayes. arXiv preprint arXiv:1312.6114 (2013)."},{"key":"e_1_3_2_1_16_1","volume-title":"Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125","author":"Kondratyuk Dan","year":"2023","unstructured":"Dan Kondratyuk, Lijun Yu, Xiuye Gu, Jos\u00e9 Lezama, Jonathan Huang, Grant Schindler, Rachel Hornung, Vighnesh Birodkar, Jimmy Yan, Ming-Chang Chiu, et al., 2023. Videopoet: A large language model for zero-shot video generation. arXiv preprint arXiv:2312.14125 (2023)."},{"key":"e_1_3_2_1_17_1","unstructured":"Weijie Kong Qi Tian Zijian Zhang Rox Min Zuozhuo Dai Jin Zhou Jiangfeng Xiong Xin Li Bo Wu Jianwei Zhang et al. 2024. HunyuanVideo: A Systematic Framework For Large Video Generative Models. arXiv preprint arXiv:2412.03603 (2024)."},{"key":"e_1_3_2_1_18_1","unstructured":"Kuaishou. 2024. KLING VIDEO MODEL. (2024). https:\/\/kling.kuaishou.com"},{"key":"e_1_3_2_1_19_1","volume-title":"Svdqunat: Absorbing outliers by low-rank components for 4-bit diffusion models. arXiv preprint arXiv:2411.05007","author":"Li Muyang","year":"2024","unstructured":"Muyang Li, Yujun Lin, Zhekai Zhang, Tianle Cai, Xiuyu Li, Junxian Guo, Enze Xie, Chenlin Meng, Jun-Yan Zhu, and Song Han. 2024a. Svdqunat: Absorbing outliers by low-rank components for 4-bit diffusion models. arXiv preprint arXiv:2411.05007 (2024)."},{"key":"e_1_3_2_1_20_1","volume-title":"Autoregressive Image Generation without Vector Quantization. arXiv preprint arXiv:2406.11838","author":"Li Tianhong","year":"2024","unstructured":"Tianhong Li, Yonglong Tian, He Li, Mingyang Deng, and Kaiming He. 2024b. Autoregressive Image Generation without Vector Quantization. arXiv preprint arXiv:2406.11838 (2024)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/WACV51458.2022.00090"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.01608"},{"key":"e_1_3_2_1_23_1","volume-title":"SpVOS: Efficient Video Object Segmentation With Triple Sparse Convolution","author":"Lin Weihao","year":"2023","unstructured":"Weihao Lin, Tao Chen, and Chong Yu. 2023. SpVOS: Efficient Video Object Segmentation With Triple Sparse Convolution. IEEE Transactions on Image Processing (2023)."},{"key":"e_1_3_2_1_24_1","volume-title":"Fr'echet Video Motion Distance: A Metric for Evaluating Motion Consistency in Videos. arXiv preprint arXiv:2407.16124","author":"Liu Jiahe","year":"2024","unstructured":"Jiahe Liu, Youran Qu, Qi Yan, Xiaohui Zeng, Lele Wang, and Renjie Liao. 2024. Fr'echet Video Motion Distance: A Metric for Evaluating Motion Consistency in Videos. arXiv preprint arXiv:2407.16124 (2024)."},{"key":"e_1_3_2_1_25_1","first-page":"5775","article-title":"Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps","volume":"35","author":"Lu Cheng","year":"2022","unstructured":"Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu. 2022a. Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps. Advances in Neural Information Processing Systems, Vol. 35 (2022), 5775-5787.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_26_1","volume-title":"Dpm-solver: Fast solver for guided sampling of diffusion probabilistic models. arXiv preprint arXiv:2211.01095","author":"Lu Cheng","year":"2022","unstructured":"Cheng Lu, Yuhao Zhou, Fan Bao, Jianfei Chen, Chongxuan Li, and Jun Zhu. 2022b. Dpm-solver: Fast solver for guided sampling of diffusion probabilistic models. arXiv preprint arXiv:2211.01095 (2022)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01492"},{"key":"e_1_3_2_1_28_1","volume-title":"Efficient diffusion models: A comprehensive survey from principles to practices. arXiv preprint arXiv:2410.11795","author":"Ma Zhiyuan","year":"2024","unstructured":"Zhiyuan Ma, Yuzhu Zhang, Guoli Jia, Liangliang Zhao, Yichao Ma, Mingjie Ma, Gaofeng Liu, Kaiyan Zhang, Jianjun Li, and Bowen Zhou. 2024b. Efficient diffusion models: A comprehensive survey from principles to practices. arXiv preprint arXiv:2410.11795 (2024)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICIP.2015.7351436"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2880603"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3524273.3532896"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/T-AIEE.1928.5055024"},{"key":"e_1_3_2_1_33_1","unstructured":"Theofanis Papakonstantinou. 2023. Content Based Video Encoding Based on Spatial and TemporalInformation."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_2_1_35_1","volume-title":"YaRN: Efficient Context Window Extension of Large Language Models. In The Twelfth International Conference on Learning Representations.","author":"Peng Bowen","unstructured":"Bowen Peng, Jeffrey Quesnelle, Honglu Fan, and Enrico Shippole. [n.d.]. YaRN: Efficient Context Window Extension of Large Language Models. In The Twelfth International Conference on Learning Representations."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11042-023-17815-3"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00196"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/JRPROC.1949.232969"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/76.915357"},{"key":"e_1_3_2_1_41_1","volume-title":"Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502","author":"Song Jiaming","year":"2020","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2020. Denoising diffusion implicit models. arXiv preprint arXiv:2010.02502 (2020)."},{"key":"e_1_3_2_1_42_1","volume-title":"Visual autoregressive modeling: Scalable image generation via next-scale prediction. arXiv preprint arXiv:2404.02905","author":"Tian Keyu","year":"2024","unstructured":"Keyu Tian, Yi Jiang, Zehuan Yuan, Bingyue Peng, and Liwei Wang. 2024. Visual autoregressive modeling: Scalable image generation via next-scale prediction. arXiv preprint arXiv:2404.02905 (2024)."},{"key":"e_1_3_2_1_43_1","volume-title":"Attention is all you need. Advances in Neural Information Processing Systems","author":"Vaswani A","year":"2017","unstructured":"A Vaswani. 2017. Attention is all you need. Advances in Neural Information Processing Systems (2017)."},{"key":"e_1_3_2_1_44_1","volume-title":"Quest: Low-bit diffusion model quantization via efficient selective finetuning. arXiv preprint arXiv:2402.03666","author":"Wang Haoxuan","year":"2024","unstructured":"Haoxuan Wang, Yuzhang Shang, Zhihang Yuan, Junyi Wu, Junchi Yan, and Yan Yan. 2024. Quest: Low-bit diffusion model quantization via efficient selective finetuning. arXiv preprint arXiv:2402.03666 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Godiva: Generating open-domain videos from natural descriptions. arXiv preprint arXiv:2104.14806","author":"Wu Chenfei","year":"2021","unstructured":"Chenfei Wu, Lun Huang, Qianxi Zhang, Binyang Li, Lei Ji, Fan Yang, Guillermo Sapiro, and Nan Duan. 2021. Godiva: Generating open-domain videos from natural descriptions. arXiv preprint arXiv:2104.14806 (2021)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20068-7_31"},{"key":"e_1_3_2_1_47_1","volume-title":"PTQ4DiT: Post-training Quantization for Diffusion Transformers. arXiv preprint arXiv:2405.16005","author":"Wu Junyi","year":"2024","unstructured":"Junyi Wu, Haoxuan Wang, Yuzhang Shang, Mubarak Shah, and Yan Yan. 2024a. PTQ4DiT: Post-training Quantization for Diffusion Transformers. arXiv preprint arXiv:2405.16005 (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"Improved Video VAE for Latent Video Diffusion Model. arXiv preprint arXiv:2411.06449","author":"Wu Pingyu","year":"2024","unstructured":"Pingyu Wu, Kai Zhu, Yu Liu, Liming Zhao, Wei Zhai, Yang Cao, and Zheng-Jun Zha. 2024b. Improved Video VAE for Latent Video Diffusion Model. arXiv preprint arXiv:2411.06449 (2024)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00343"},{"key":"e_1_3_2_1_50_1","volume-title":"Large Motion Video Autoencoding with Cross-modal Video VAE. arXiv preprint arXiv:2412.17805","author":"Xing Yazhou","year":"2024","unstructured":"Yazhou Xing, Yang Fei, Yingqing He, Jingye Chen, Jiaxin Xie, Xiaowei Chi, and Qifeng Chen. 2024. Large Motion Video Autoencoding with Cross-modal Video VAE. arXiv preprint arXiv:2412.17805 (2024)."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00067"},{"key":"e_1_3_2_1_52_1","volume-title":"Efficient Video Diffusion Models via Content-Frame Motion-Latent Decomposition. arXiv preprint arXiv:2403.14148","author":"Yu Sihyun","year":"2024","unstructured":"Sihyun Yu, Weili Nie, De-An Huang, Boyi Li, Jinwoo Shin, and Anima Anandkumar. 2024. Efficient Video Diffusion Models via Content-Frame Motion-Latent Decomposition. arXiv preprint arXiv:2403.14148 (2024)."},{"key":"e_1_3_2_1_53_1","volume-title":"Ditfastattn: Attention compression for diffusion transformer models. arXiv preprint arXiv:2406.08552","author":"Yuan Zhihang","year":"2024","unstructured":"Zhihang Yuan, Hanling Zhang, Pu Lu, Xuefei Ning, Linfeng Zhang, Tianchen Zhao, Shengen Yan, Guohao Dai, and Yu Wang. 2024. Ditfastattn: Attention compression for diffusion transformer models. arXiv preprint arXiv:2406.08552 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Fast sampling of diffusion models with exponential integrator. arXiv preprint arXiv:2204.13902","author":"Zhang Qinsheng","year":"2022","unstructured":"Qinsheng Zhang and Yongxin Chen. 2022. Fast sampling of diffusion models with exponential integrator. arXiv preprint arXiv:2204.13902 (2022)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_2_1_56_1","volume-title":"Open-sora: Democratizing efficient video production for all. arXiv preprint arXiv:2412.20404","author":"Zheng Zangwei","year":"2024","unstructured":"Zangwei Zheng, Xiangyu Peng, Tianji Yang, Chenhui Shen, Shenggui Li, Hongxin Liu, Yukun Zhou, Tianyi Li, and Yang You. 2024. Open-sora: Democratizing efficient video production for all. arXiv preprint arXiv:2412.20404 (2024)."},{"key":"e_1_3_2_1_57_1","volume-title":"Designing a better asymmetric vqgan for stablediffusion. arXiv preprint arXiv:2306.04632","author":"Zhu Zixin","year":"2023","unstructured":"Zixin Zhu, Xuelu Feng, Dongdong Chen, Jianmin Bao, Le Wang, Yinpeng Chen, Lu Yuan, and Gang Hua. 2023. Designing a better asymmetric vqgan for stablediffusion. arXiv preprint arXiv:2306.04632 (2023)."}],"event":{"name":"MM '25: The 33rd ACM International Conference on Multimedia","location":"Dublin Ireland","acronym":"MM '25","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 33rd ACM International Conference on Multimedia"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3746027.3755638","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,10]],"date-time":"2025-12-10T04:59:30Z","timestamp":1765342770000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3746027.3755638"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,10,27]]},"references-count":57,"alternative-id":["10.1145\/3746027.3755638","10.1145\/3746027"],"URL":"https:\/\/doi.org\/10.1145\/3746027.3755638","relation":{},"subject":[],"published":{"date-parts":[[2025,10,27]]},"assertion":[{"value":"2025-10-27","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}