{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:10:22Z","timestamp":1765465822916,"version":"3.48.0"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","funder":[{"name":"Research Grants Council, University Grants Committee","award":["C7004-22G, 14212425"],"award-info":[{"award-number":["C7004-22G, 14212425"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,3,22]]},"DOI":"10.1145\/3760250.3762216","type":"proceedings-article","created":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:06:36Z","timestamp":1765465596000},"page":"101-116","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Dynamic Sparsity in Large-Scale Video DiT Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3785-9700","authenticated-orcid":false,"given":"Xin","family":"Tan","sequence":"first","affiliation":[{"name":"Computer Science and Engineering, The Chinese University of Hong Kong, Shatin, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-3818-4753","authenticated-orcid":false,"given":"Yuetao","family":"Chen","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, The Chinese University of Hong Kong, Shatin, Hong Kong"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0049-873X","authenticated-orcid":false,"given":"Yimin","family":"Jiang","sequence":"additional","affiliation":[{"name":"Independent, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6453-5026","authenticated-orcid":false,"given":"Xing","family":"Chen","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8290-5169","authenticated-orcid":false,"given":"Kun","family":"Yan","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3387-4674","authenticated-orcid":false,"given":"Nan","family":"Duan","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9113-2660","authenticated-orcid":false,"given":"Yibo","family":"Zhu","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6657-5806","authenticated-orcid":false,"given":"Daxin","family":"Jiang","sequence":"additional","affiliation":[{"name":"StepFun, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9359-9571","authenticated-orcid":false,"given":"Hong","family":"Xu","sequence":"additional","affiliation":[{"name":"Computer Science and Engineering, The Chinese University of Hong Kong, Shatin, Hong Kong"}]}],"member":"320","published-online":{"date-parts":[[2025,12,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2024. CLIP. https:\/\/github.com\/openai\/CLIP."},{"key":"e_1_3_2_1_2_1","unstructured":"2024. FusedAttention. https:\/\/triton-lang.org\/main\/getting-started\/tutorials\/06-fused-attention.html."},{"key":"e_1_3_2_1_3_1","unstructured":"2024. HunyuanVideo. https:\/\/github.com\/Tencent\/HunyuanVideo."},{"key":"e_1_3_2_1_4_1","unstructured":"2024. kling. https:\/\/kling.kuaishou.com\/."},{"key":"e_1_3_2_1_5_1","unstructured":"2024. Llama 3.1. https:\/\/ai.meta.com\/blog\/meta-llama-3--1\/."},{"key":"e_1_3_2_1_6_1","unstructured":"2024. Mistral-7B. https:\/\/mistral.ai\/news\/announcing-mistral-7b\/."},{"key":"e_1_3_2_1_7_1","unstructured":"2024. Open-Sora. https:\/\/github.com\/hpcaitech\/Open-Sora."},{"key":"e_1_3_2_1_8_1","unstructured":"2024. Openai Triton. https:\/\/github.com\/triton-lang\/triton."},{"key":"e_1_3_2_1_9_1","unstructured":"2024. Sora. https:\/\/openai.com\/sora\/."},{"key":"e_1_3_2_1_10_1","unstructured":"2024. stabilityai-stable-diffusion-2--1-base. https:\/\/huggingface.co\/stabilityai\/stable-diffusion-2--1-base\/tree\/main."},{"key":"e_1_3_2_1_11_1","unstructured":"2024. T5_v1_1-xxl. https:\/\/huggingface.co\/google\/t5-v1_1-xxl."},{"key":"e_1_3_2_1_12_1","volume-title":"Proc. IEEE\/CVF ICCV.","author":"Bain Max","year":"2021","unstructured":"Max Bain, Arsha Nagrani, G\u00fcl Varol, and Andrew Zisserman. 2021. Frozen in Time: A Joint Video and Image Encoder for End-to-End Retrieval. In Proc. IEEE\/CVF ICCV."},{"key":"e_1_3_2_1_13_1","volume-title":"Vidu: a highly consistent, dynamic and skilled text-to-video generator with diffusion models. arXiv preprint arXiv:2405.04233","author":"Bao Fan","year":"2024","unstructured":"Fan Bao, Chendong Xiang, Gang Yue, Guande He, Hongzhou Zhu, Kaiwen Zheng, Min Zhao, Shilong Liu, YaoleWang, and Jun Zhu. 2024. Vidu: a highly consistent, dynamic and skilled text-to-video generator with diffusion models. arXiv preprint arXiv:2405.04233 (2024)."},{"key":"e_1_3_2_1_14_1","unstructured":"Junsong Chen Jincheng Yu Chongjian Ge Lewei Yao Enze Xie Yue Wu Zhongdao Wang James Kwok Ping Luo Huchuan Lu et al. 2023. Pixartalpha : Fast training of diffusion transformer for photorealistic text-to-image synthesis. arXiv preprint arXiv:2310.00426 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174","author":"Chen Tianqi","year":"2016","unstructured":"Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. 2016. Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174 (2016)."},{"key":"e_1_3_2_1_16_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. 2023. Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)."},{"key":"e_1_3_2_1_17_1","volume-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness. In Advances in Neural Information Processing Systems.","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. 2022. Flashattention: Fast and memory-efficient exact attention with io-awareness. In Advances in Neural Information Processing Systems."},{"key":"e_1_3_2_1_18_1","volume-title":"An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2020","unstructured":"Alexey Dosovitskiy. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_19_1","volume-title":"Proc. ICML.","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, Rahim Entezari, Jonas M\u00fcller, Harry Saini, Yam Levi, Dominik Lorenz, Axel Sauer, Frederic Boesel, et al. 2024. Scaling rectified flow transformers for highresolution image synthesis. In Proc. ICML."},{"key":"e_1_3_2_1_20_1","volume-title":"Usp: A unified sequence parallelism approach for long context generative ai. arXiv preprint arXiv:2405.07719","author":"Fang Jiarui","year":"2024","unstructured":"Jiarui Fang and Shangchun Zhao. 2024. Usp: A unified sequence parallelism approach for long context generative ai. arXiv preprint arXiv:2405.07719 (2024)."},{"key":"e_1_3_2_1_21_1","volume-title":"Ting Cao, Fan Yang, and Mao Yang.","author":"Gao Yizhao","year":"2024","unstructured":"Yizhao Gao, Zhichen Zeng, Dayou Du, Shijie Cao, Hayden Kwok-Hay So, Ting Cao, Fan Yang, and Mao Yang. 2024. Seerattention: Learning intrinsic sparse attention in your llms. arXiv preprint arXiv:2410.13276 (2024)."},{"key":"e_1_3_2_1_22_1","volume-title":"Loongtrain: Efficient training of long-sequence llms with head-context parallelism. arXiv preprint arXiv:2406.18485","author":"Gu Diandian","year":"2024","unstructured":"Diandian Gu, Peng Sun, Qinghao Hu, Ting Huang, Xun Chen, Yingtong Xiong, Guoteng Wang, Qiaoling Chen, Shangchun Zhao, Jiarui Fang, et al. 2024. Loongtrain: Efficient training of long-sequence llms with head-context parallelism. arXiv preprint arXiv:2406.18485 (2024)."},{"key":"e_1_3_2_1_23_1","volume-title":"Proc. IEEE\/CVF CVPR.","author":"Hassani Ali","year":"2023","unstructured":"Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi. 2023. Neighborhood attention transformer. In Proc. IEEE\/CVF CVPR."},{"key":"e_1_3_2_1_24_1","volume-title":"Proc. NeurIPS.","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising diffusion probabilistic models. In Proc. NeurIPS."},{"key":"e_1_3_2_1_25_1","volume-title":"Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598","author":"Ho Jonathan","year":"2022","unstructured":"Jonathan Ho and Tim Salimans. 2022. Classifier-free diffusion guidance. arXiv preprint arXiv:2207.12598 (2022)."},{"key":"e_1_3_2_1_26_1","volume-title":"Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868","author":"Hong Wenyi","year":"2022","unstructured":"Wenyi Hong, Ming Ding, Wendi Zheng, Xinghan Liu, and Jie Tang. 2022. Cogvideo: Large-scale pretraining for text-to-video generation via transformers. arXiv preprint arXiv:2205.15868 (2022)."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_2_1_28_1","volume-title":"Samyam Rajbhandari, and Yuxiong He.","author":"Jacobs Sam Ade","year":"2023","unstructured":"Sam Ade Jacobs, Masahiro Tanaka, Chengming Zhang, Minjia Zhang, Shuaiwen Leon Song, Samyam Rajbhandari, and Yuxiong He. 2023. Deepspeed ulysses: System optimizations for enabling training of extreme long sequence transformer models. arXiv preprint arXiv:2309.14509 (2023)."},{"key":"e_1_3_2_1_29_1","unstructured":"Huiqiang Jiang Yucheng Li Chengruidong Zhang Qianhui Wu Xufang Luo Surin Ahn Zhenhua Han Amir H Abdi Dongsheng Li Chin-Yew Lin et al. 2024. Minference 1.0: Accelerating pre-filling for long-context llms via dynamic sparse attention. arXiv preprint arXiv:2407.02490 (2024)."},{"key":"e_1_3_2_1_30_1","volume-title":"Moh: Multi-head attention as mixture-of-head attention. arXiv preprint arXiv:2410.11842","author":"Jin Peng","year":"2024","unstructured":"Peng Jin, Bo Zhu, Li Yuan, and Shuicheng Yan. 2024. Moh: Multi-head attention as mixture-of-head attention. arXiv preprint arXiv:2410.11842 (2024)."},{"key":"e_1_3_2_1_31_1","volume-title":"Adaptive caching for faster video generation with diffusion transformers. arXiv preprint arXiv:2411.02397","author":"Kahatapitiya Kumara","year":"2024","unstructured":"Kumara Kahatapitiya, Haozhe Liu, Sen He, Ding Liu, Menglin Jia, Michael S Ryoo, and Tian Xie. 2024. Adaptive caching for faster video generation with diffusion transformers. arXiv preprint arXiv:2411.02397 (2024)."},{"key":"e_1_3_2_1_32_1","volume-title":"Information aggregation for multi-head attention with routing-by-agreement. arXiv preprint arXiv:1904.03100","author":"Li Jian","year":"2019","unstructured":"Jian Li, Baosong Yang, Zi-Yi Dou, Xing Wang, Michael R Lyu, and Zhaopeng Tu. 2019. Information aggregation for multi-head attention with routing-by-agreement. arXiv preprint arXiv:1904.03100 (2019)."},{"key":"e_1_3_2_1_33_1","volume-title":"Heli Ben-Hamu, Maximilian Nickel, and Matt Le.","author":"Lipman Yaron","year":"2022","unstructured":"Yaron Lipman, Ricky TQ Chen, Heli Ben-Hamu, Maximilian Nickel, and Matt Le. 2022. Flow matching for generative modeling. arXiv preprint arXiv:2210.02747 (2022)."},{"key":"e_1_3_2_1_34_1","volume-title":"Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889","author":"Liu Hao","year":"2023","unstructured":"Hao Liu, Matei Zaharia, and Pieter Abbeel. 2023. Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889 (2023)."},{"key":"e_1_3_2_1_35_1","volume-title":"Proc. ECCV.","author":"Ma Nanye","year":"2024","unstructured":"Nanye Ma, Mark Goldstein, Michael S Albergo, Nicholas M Boffi, Eric Vanden-Eijnden, and Saining Xie. 2024. Sit: Exploring flow and diffusion-based generative models with scalable interpolant transformers. In Proc. ECCV."},{"key":"e_1_3_2_1_36_1","volume-title":"Latte: Latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048","author":"Ma Xin","year":"2024","unstructured":"Xin Ma, Yaohui Wang, Gengyun Jia, Xinyuan Chen, Ziwei Liu, Yuan-Fang Li, Cunjian Chen, and Yu Qiao. 2024. Latte: Latent diffusion transformer for video generation. arXiv preprint arXiv:2401.03048 (2024)."},{"key":"e_1_3_2_1_37_1","volume-title":"Openvid-1m: A large-scale high-quality dataset for text-to-video generation. arXiv preprint arXiv:2407.02371","author":"Nan Kepan","year":"2024","unstructured":"Kepan Nan, Rui Xie, Penghao Zhou, Tiehan Fan, Zhenheng Yang, Zhijie Chen, Xiang Li, Jian Yang, and Ying Tai. 2024. Openvid-1m: A large-scale high-quality dataset for text-to-video generation. arXiv preprint arXiv:2407.02371 (2024)."},{"key":"e_1_3_2_1_38_1","unstructured":"OpenAI. 2023. GPT-4 Technical Report. arXiv preprint arXiv:2303.08774 (2023)."},{"key":"e_1_3_2_1_39_1","volume-title":"Proc. IEEE\/CVF ICCV.","author":"Peebles William","year":"2023","unstructured":"William Peebles and Saining Xie. 2023. Scalable diffusion models with transformers. In Proc. IEEE\/CVF ICCV."},{"key":"e_1_3_2_1_40_1","unstructured":"Adam Polyak Amit Zohar Andrew Brown Andros Tjandra Animesh Sinha Ann Lee Apoorv Vyas Bowen Shi Chih-Yao Ma Ching-Yao Chuang et al. 2024. Movie gen: A cast of media foundation models. arXiv preprint arXiv:2410.13720 (2024)."},{"key":"e_1_3_2_1_41_1","volume-title":"Proc. ECCV.","author":"Pu Yifan","year":"2024","unstructured":"Yifan Pu, Zhuofan Xia, Jiayi Guo, Dongchen Han, Qixiu Li, Duo Li, Yuhui Yuan, Ji Li, Yizeng Han, Shiji Song, et al. 2024. Efficient diffusion transformer with step-wise dynamic attention mediators. In Proc. ECCV."},{"key":"e_1_3_2_1_42_1","volume-title":"UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402","author":"Soomro K","year":"2012","unstructured":"K Soomro. 2012. UCF101: A dataset of 101 human actions classes from videos in the wild. arXiv preprint arXiv:1212.0402 (2012)."},{"key":"e_1_3_2_1_43_1","volume-title":"Unveiling Redundancy in Diffusion Transformers (DiTs): A Systematic Study. arXiv preprint arXiv:2411.13588","author":"Sun Xibo","year":"2024","unstructured":"Xibo Sun, Jiarui Fang, Aoyu Li, and Jinzhe Pan. 2024. Unveiling Redundancy in Diffusion Transformers (DiTs): A Systematic Study. arXiv preprint arXiv:2411.13588 (2024)."},{"key":"e_1_3_2_1_44_1","volume-title":"Vidgen-1m: A large-scale dataset for text-to-video generation. arXiv preprint arXiv:2408.02629","author":"Tan Zhiyu","year":"2024","unstructured":"Zhiyu Tan, Xiaomeng Yang, Luozheng Qin, and Hao Li. 2024. Vidgen-1m: A large-scale dataset for text-to-video generation. arXiv preprint arXiv:2408.02629 (2024)."},{"key":"e_1_3_2_1_45_1","volume-title":"Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly.","author":"Unterthiner Thomas","year":"2018","unstructured":"Thomas Unterthiner, Sjoerd Van Steenkiste, Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly. 2018. Towards accurate generative models of video: A new metric & challenges. arXiv preprint arXiv:1812.01717 (2018)."},{"key":"e_1_3_2_1_46_1","volume-title":"Analyzing the structure of attention in a transformer language model. arXiv preprint arXiv:1906.04284","author":"Vig Jesse","year":"2019","unstructured":"Jesse Vig and Yonatan Belinkov. 2019. Analyzing the structure of attention in a transformer language model. arXiv preprint arXiv:1906.04284 (2019)."},{"key":"e_1_3_2_1_47_1","volume-title":"Wan: Open and Advanced Large-Scale Video Generative Models. arXiv preprint arXiv:2503.20314","author":"Wan Team","year":"2025","unstructured":"Team Wan, Ang Wang, Baole Ai, Bin Wen, Chaojie Mao, Chen-Wei Xie, Di Chen, Feiwu Yu, Haiming Zhao, Jianxiao Yang, Jianyuan Zeng, Jiayu Wang, Jingfeng Zhang, Jingren Zhou, Jinkai Wang, Jixuan Chen, Kai Zhu, Kang Zhao, Keyu Yan, Lianghua Huang, Mengyang Feng, Ningyi Zhang, Pandeng Li, Pingyu Wu, Ruihang Chu, Ruili Feng, Shiwei Zhang, Siyang Sun, Tao Fang, Tianxing Wang, Tianyi Gui, Tingyu Weng, Tong Shen, Wei Lin, Wei Wang, Wei Wang, Wenmeng Zhou, Wente Wang, Wenting Shen, Wenyuan Yu, Xianzhong Shi, Xiaoming Huang, Xin Xu, Yan Kou, Yangyu Lv, Yifei Li, Yijing Liu, Yiming Wang, Yingya Zhang, Yitong Huang, Yong Li, You Wu, Yu Liu, Yulin Pan, Yun Zheng, Yuntao Hong, Yupeng Shi, Yutong Feng, Zeyinzi Jiang, Zhen Han, Zhi-Fan Wu, and Ziyu Liu. 2025. Wan: Open and Advanced Large-Scale Video Generative Models. arXiv preprint arXiv:2503.20314 (2025)."},{"key":"e_1_3_2_1_48_1","volume-title":"Qihoo-t2x: An efficiency-focused diffusion transformer via proxy tokens for text-to-any-task. arXiv e-prints","author":"Wang Jing","year":"2024","unstructured":"Jing Wang, Ao Ma, Jiasong Feng, Dawei Leng, Yuhui Yin, and Xiaodan Liang. 2024. Qihoo-t2x: An efficiency-focused diffusion transformer via proxy tokens for text-to-any-task. arXiv e-prints (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"Proc. USENIX ATC.","author":"Wang Yuke","year":"2023","unstructured":"Yuke Wang, Boyuan Feng, Zheng Wang, Guyue Huang, and Yufei Ding. 2023. {TC-GNN}: Bridging sparse {GNN} computation and dense tensor cores on {GPUs}. In Proc. USENIX ATC."},{"key":"e_1_3_2_1_50_1","unstructured":"A Waswani N Shazeer N Parmar J Uszkoreit L Jones A Gomez L Kaiser and I Polosukhin. 2017. Attention is all you need. In NIPS."},{"key":"e_1_3_2_1_51_1","unstructured":"Haocheng Xi Shuo Yang Yilong Zhao Chenfeng Xu Muyang Li Xiuyu Li Yujun Lin Han Cai Jintao Zhang Dacheng Li et al. 2025. Sparse VideoGen: Accelerating Video Diffusion Transformers with Spatial-Temporal Sparsity. arXiv preprint arXiv:2502.01776 (2025)."},{"key":"e_1_3_2_1_52_1","volume-title":"Efficient streaming language models with attention sinks. arXiv preprint arXiv:2309.17453","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Yuandong Tian, Beidi Chen, Song Han, and Mike Lewis. 2023. Efficient streaming language models with attention sinks. arXiv preprint arXiv:2309.17453 (2023)."},{"key":"e_1_3_2_1_53_1","volume-title":"EasyAnimate: A High-Performance Long Video Generation Method based on Transformer Architecture. arXiv preprint arXiv:2405.18991","author":"Xu Jiaqi","year":"2024","unstructured":"Jiaqi Xu, Xinyi Zou, Kunzhe Huang, Yunkuo Chen, Bo Liu, MengLi Cheng, Xing Shi, and Jun Huang. 2024. EasyAnimate: A High-Performance Long Video Generation Method based on Transformer Architecture. arXiv preprint arXiv:2405.18991 (2024)."},{"key":"e_1_3_2_1_54_1","volume-title":"Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:2408.06072","author":"Yang Zhuoyi","year":"2024","unstructured":"Zhuoyi Yang, Jiayan Teng, Wendi Zheng, Ming Ding, Shiyu Huang, Jiazheng Xu, Yuanming Yang, Wenyi Hong, Xiaohan Zhang, Guanyu Feng, et al. 2024. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:2408.06072 (2024)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"crossref","unstructured":"Jingyang Yuan Huazuo Gao Damai Dai Junyu Luo Liang Zhao Zhengyan Zhang Zhenda Xie YX Wei Lean Wang Zhiping Xiao et al. 2025. Native Sparse Attention: Hardware-Aligned and Natively Trainable Sparse Attention. arXiv preprint arXiv:2502.11089 (2025).","DOI":"10.18653\/v1\/2025.acl-long.1126"},{"key":"e_1_3_2_1_56_1","volume-title":"DiTFastAttn: Attention Compression for Diffusion Transformer Models. arXiv preprint arXiv:2406.08552","author":"Yuan Zhihang","year":"2024","unstructured":"Zhihang Yuan, Hanling Zhang, Pu Lu, Xuefei Ning, Linfeng Zhang, Tianchen Zhao, Shengen Yan, Guohao Dai, and Yu Wang. 2024. DiTFastAttn: Attention Compression for Diffusion Transformer Models. arXiv preprint arXiv:2406.08552 (2024)."},{"key":"e_1_3_2_1_57_1","volume-title":"Fast Video Generation with Sliding Tile Attention. arXiv preprint arXiv:2502.04507","author":"Zhang Peiyuan","year":"2025","unstructured":"Peiyuan Zhang, Yongqi Chen, Runlong Su, Hangliang Ding, Ion Stoica, Zhenghong Liu, and Hao Zhang. 2025. Fast Video Generation with Sliding Tile Attention. arXiv preprint arXiv:2502.04507 (2025)."},{"key":"e_1_3_2_1_58_1","volume-title":"Proc. NeurIPS.","author":"Zhang Zhenyu","year":"2023","unstructured":"Zhenyu Zhang, Ying Sheng, Tianyi Zhou, Tianlong Chen, Lianmin Zheng, Ruisi Cai, Zhao Song, Yuandong Tian, Christopher R\u00e9, Clark Barrett, et al. 2023. H2o: Heavy-hitter oracle for efficient generative inference of large language models. In Proc. NeurIPS."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer et al. 2023. Pytorch fsdp: experiences on scaling fully sharded data parallel. arXiv preprint arXiv:2304.11277 (2023).","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_2_1_60_1","volume-title":"Proc. IEEE\/CVF CVPR.","author":"Zhu Lianghui","year":"2025","unstructured":"Lianghui Zhu, Zilong Huang, Bencheng Liao, Jun Hao Liew, Hanshu Yan, Jiashi Feng, and XinggangWang. 2025. Dig: Scalable and efficient diffusion models with gated linear attention. In Proc. IEEE\/CVF CVPR."}],"event":{"name":"ASPLOS '26:31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Pittsburgh PA USA","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 31st ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3760250.3762216","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,11]],"date-time":"2025-12-11T15:07:43Z","timestamp":1765465663000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3760250.3762216"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,11]]},"references-count":60,"alternative-id":["10.1145\/3760250.3762216","10.1145\/3760250"],"URL":"https:\/\/doi.org\/10.1145\/3760250.3762216","relation":{},"subject":[],"published":{"date-parts":[[2025,12,11]]},"assertion":[{"value":"2025-12-11","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}