{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,13]],"date-time":"2026-06-13T16:04:04Z","timestamp":1781366644784,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":69,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,15]]},"DOI":"10.1145\/3757377.3763831","type":"proceedings-article","created":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T16:27:29Z","timestamp":1765211249000},"page":"1-11","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Input-Aware Sparse Attention for Real-Time Co-Speech Video Generation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3113-9131","authenticated-orcid":false,"given":"Beijia","family":"Lu","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-4064-324X","authenticated-orcid":false,"given":"Ziyi","family":"Chen","sequence":"additional","affiliation":[{"name":"PAII Inc., Palo Alto, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9615-4749","authenticated-orcid":false,"given":"Jing","family":"Xiao","sequence":"additional","affiliation":[{"name":"PAII Inc., Palo Alto, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8504-3410","authenticated-orcid":false,"given":"Jun-Yan","family":"Zhu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,12,14]]},"reference":[{"key":"e_1_3_3_1_2_1","doi-asserted-by":"publisher","DOI":"10.52202\/079017-1873"},{"key":"e_1_3_3_1_3_1","unstructured":"Andreas Blattmann Tim Dockhorn Sumith Kulal Daniel Mendelevitch Maciej Kilian Dominik Lorenz Yam Levi Zion English Vikram Voleti Adam Letts Varun Jampani and Robin Rombach. 2023. Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.15127 (2023)."},{"key":"e_1_3_3_1_4_1","doi-asserted-by":"crossref","unstructured":"Boyuan Chen Diego Mart\u00ed\u00a0Mons\u00f3 Yilun Du Max Simchowitz Russ Tedrake and Vincent Sitzmann. 2025. Diffusion forcing: Next-token prediction meets full-sequence diffusion. Conference on Neural Information Processing Systems (NeurIPS) 37 (2025) 24081\u201324125.","DOI":"10.52202\/079017-0759"},{"key":"e_1_3_3_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00702"},{"key":"e_1_3_3_1_6_1","unstructured":"Rewon Child Scott Gray Alec Radford and Ilya Sutskever. 2019. Generating long sequences with sparse transformers. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1904.10509 (2019)."},{"key":"e_1_3_3_1_7_1","unstructured":"Enric Corona Andrei Zanfir Eduard\u00a0Gabriel Bazavan Nikos Kolotouros Thiemo Alldieck and Cristian Sminchisescu. 2024. Vlogger: Multimodal diffusion for embodied avatar synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.08764 (2024)."},{"key":"e_1_3_3_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.01483"},{"key":"e_1_3_3_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00482"},{"key":"e_1_3_3_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2019.00038"},{"key":"e_1_3_3_1_11_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Franceschi Jean-Yves","year":"2023","unstructured":"Jean-Yves Franceschi, Mike Gartrell, Ludovic Dos\u00a0Santos, Thibaut Issenhuth, Emmanuel de Bezenac, Micka\u00ebl Chen, and Alain Rakotomamonjy. 2023. Unifying GANs and Score-Based Diffusion as Generative Particle Models. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00361"},{"key":"e_1_3_3_1_13_1","unstructured":"Ian\u00a0J Goodfellow Jean Pouget-Abadie Mehdi Mirza Bing Xu David Warde-Farley Sherjil Ozair Aaron Courville and Yoshua Bengio. 2014. Generative adversarial nets. Conference on Neural Information Processing Systems (NeurIPS) 27 (2014)."},{"key":"e_1_3_3_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687571"},{"key":"e_1_3_3_1_15_1","unstructured":"Junxian Guo Haotian Tang Shang Yang Zhekai Zhang Zhijian Liu and Song Han. 2024. Block Sparse Attention. https:\/\/github.com\/mit-han-lab\/Block-Sparse-Attention."},{"key":"e_1_3_3_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00220"},{"key":"e_1_3_3_1_17_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Heusel Martin","year":"2017","unstructured":"Martin Heusel, Hubert Ramsauer, Thomas Unterthiner, Bernhard Nessler, and Sepp Hochreiter. 2017. GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_18_1","unstructured":"Geoffrey Hinton Oriol Vinyals and Jeff Dean. 2015. Distilling the knowledge in a neural network. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1503.02531 (2015)."},{"key":"e_1_3_3_1_19_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. 2020. Denoising Diffusion Probabilistic Models. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_20_1","unstructured":"Jonathan Ho Tim Salimans Alexey Gritsenko William Chan Mohammad Norouzi and David\u00a0J Fleet. 2022. Video diffusion models. arXiv:https:\/\/arXiv.org\/abs\/2204.03458 (2022)."},{"key":"e_1_3_3_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW63382.2024.00198"},{"key":"e_1_3_3_1_22_1","unstructured":"Li Hu Xin Gao Peng Zhang Ke Sun Bang Zhang and Liefeng Bo. 2023. Animate Anyone: Consistent and Controllable Image-to-Video Synthesis for Character Animation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2311.17117 (2023)."},{"key":"e_1_3_3_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.632"},{"key":"e_1_3_3_1_24_1","unstructured":"Yang Jin Zhicheng Sun Ningyuan Li Kun Xu Hao Jiang Nan Zhuang Quzhe Huang Yang Song Yadong Mu and Zhouchen Lin. 2024. Pyramidal Flow Matching for Efficient Video Generative Modeling. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2410.05954 (2024)."},{"key":"e_1_3_3_1_25_1","first-page":"428","volume-title":"European Conference on Computer Vision","author":"Kang Minguk","year":"2024","unstructured":"Minguk Kang, Richard Zhang, Connelly Barnes, Sylvain Paris, Suha Kwak, Jaesik Park, Eli Shechtman, Jun-Yan Zhu, and Taesung Park. 2024. Distilling diffusion models into conditional gans. In European Conference on Computer Vision. Springer, 428\u2013447."},{"key":"e_1_3_3_1_26_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Kim Dongjun","year":"2024","unstructured":"Dongjun Kim, Chieh-Hsin Lai, Wei-Hsiang Liao, Naoki Murata, Yuhta Takida, Toshimitsu Uesaka, Yutong He, Yuki Mitsufuji, and Stefano Ermon. 2024. Consistency Trajectory Models: Learning Probability Flow ODE Trajectory of Diffusion. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_27_1","unstructured":"Gaojie Lin Jianwen Jiang Chao Liang Tianyun Zhong Jiaqi Yang and Yanbo Zheng. 2024. Cyberhost: Taming audio-driven avatar diffusion model with region codebook attention. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.01876 (2024)."},{"key":"e_1_3_3_1_28_1","unstructured":"Shanchuan Lin Xin Xia Yuxi Ren Ceyuan Yang Xuefeng Xiao and Lu Jiang. 2025. Diffusion adversarial post-training for one-step video generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.08316 (2025)."},{"key":"e_1_3_3_1_29_1","unstructured":"Shanchuan Lin and Xiao Yang. 2024. AnimateDiff-Lightning: Cross-Model Diffusion Distillation. arxiv:https:\/\/arXiv.org\/abs\/2403.12706"},{"key":"e_1_3_3_1_30_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Liu Haiyang","year":"2025","unstructured":"Haiyang Liu, Xingchao Yang, Tomoya Akiyama, Yuantian Huang, Qiaoge Li, Shigeru Kuriyama, and Takafumi Taketomi. 2025. TANGO: Co-Speech Gesture Video Reenactment with Hierarchical Audio Motion Embedding and Diffusion Interpolation. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00115"},{"key":"e_1_3_3_1_32_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Liu Xian","year":"2022","unstructured":"Xian Liu, Qianyi Wu, Hang Zhou, Yuanqi Du, Wayne Wu, Dahua Lin, and Ziwei Liu. 2022. Audio-Driven Co-Speech Gesture Video Generation. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_33_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Liu Xingchao","year":"2024","unstructured":"Xingchao Liu, Xiwen Zhang, Jianzhu Ma, Jian Peng, and Qiang Liu. 2024a. InstaFlow: One Step is Enough for High-Quality Diffusion-based Text-to-Image Generation. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_34_1","unstructured":"Eric Luhman and Troy Luhman. 2021. Knowledge Distillation in Iterative Generative Models for Improved Sampling Speed. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2101.02388 (2021)."},{"key":"e_1_3_3_1_35_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Luo Weijian","year":"2023","unstructured":"Weijian Luo, Tianyang Hu, Shifeng Zhang, Jiacheng Sun, Zhenguo Li, and Zhihua Zhang. 2023. Diff-Instruct: A Universal Approach for Transferring Knowledge from Pre-Trained Diffusion Models. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_36_1","volume-title":"European Conference on Computer Vision (ECCV)","author":"Mahapatra Aniruddha","year":"2024","unstructured":"Aniruddha Mahapatra, Richa Mishra, Renda Li, Ziyi Chen, Boyang Ding, Shoulei Wang, Jun-Yan Zhu, Peng Chang, Mei Han, and Jing Xiao. 2024. Co-Speech Gesture Video Generation with 3D Human Meshes. In European Conference on Computer Vision (ECCV)."},{"key":"e_1_3_3_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.00516"},{"key":"e_1_3_3_1_38_1","unstructured":"Gaurav Parmar Taesung Park Srinivasa Narasimhan and Jun-Yan Zhu. 2024. One-step image translation with text-to-image models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.12036 (2024)."},{"key":"e_1_3_3_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51070.2023.00387"},{"key":"e_1_3_3_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413532"},{"key":"e_1_3_3_1_41_1","first-page":"8748","volume-title":"International Conference on Machine Learning (ICML)","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In International Conference on Machine Learning (ICML). 8748\u20138763."},{"key":"e_1_3_3_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00232"},{"key":"e_1_3_3_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00985"},{"key":"e_1_3_3_1_45_1","unstructured":"Tim Salimans and Jonathan Ho. 2022. Progressive distillation for fast sampling of diffusion models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2202.00512 (2022)."},{"key":"e_1_3_3_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3680528.3687625"},{"key":"e_1_3_3_1_47_1","first-page":"87","volume-title":"European Conference on Computer Vision (ECCV)","author":"Sauer Axel","year":"2024","unstructured":"Axel Sauer, Dominik Lorenz, Andreas Blattmann, and Robin Rombach. 2024b. Adversarial diffusion distillation. In European Conference on Computer Vision (ECCV). Springer, 87\u2013103."},{"key":"e_1_3_3_1_48_1","doi-asserted-by":"crossref","unstructured":"Scott Schaefer Travis McPhail and Joe Warren. 2006. Image deformation using moving least squares. ACM Transactions on Graphics (TOG) 25 3 (2006) 533\u2013540.","DOI":"10.1145\/1141911.1141920"},{"key":"e_1_3_3_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00248"},{"key":"e_1_3_3_1_50_1","unstructured":"Aliaksandr Siarohin St\u00e9phane Lathuili\u00e8re Sergey Tulyakov Elisa Ricci and Nicu Sebe. 2019b. First order motion model for image animation. Conference on Neural Information Processing Systems (NeurIPS) 32 (2019)."},{"key":"e_1_3_3_1_51_1","first-page":"2256","volume-title":"International Conference on Learning Representations (ICLR)","author":"Sohl-Dickstein Jascha","year":"2015","unstructured":"Jascha Sohl-Dickstein, Eric Weiss, Niru Maheswaranathan, and Surya Ganguli. 2015. Deep unsupervised learning using nonequilibrium thermodynamics. In International Conference on Learning Representations (ICLR). pmlr, 2256\u20132265."},{"key":"e_1_3_3_1_52_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Song Jiaming","year":"2021","unstructured":"Jiaming Song, Chenlin Meng, and Stefano Ermon. 2021. Denoising Diffusion Implicit Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_1_53_1","unstructured":"Linrui Tian Siqi Hu Qi Wang Bang Zhang and Liefeng Bo. 2024. EMO2: End-Effector Guided Audio-Driven Avatar Video Generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.10687 (2024)."},{"key":"e_1_3_3_1_54_1","volume-title":"International Conference on Learning Representations (ICLR) Workshop","author":"Unterthiner Thomas","year":"2019","unstructured":"Thomas Unterthiner, Sjoerd van Steenkiste, Karol Kurach, Raphael Marinier, Marcin Michalski, and Sylvain Gelly. 2019. FVD: A new metric for video generation. In International Conference on Learning Representations (ICLR) Workshop."},{"key":"e_1_3_3_1_55_1","unstructured":"Aaron Van Den\u00a0Oord Oriol Vinyals et\u00a0al. 2017. Neural discrete representation learning. Conference on Neural Information Processing Systems (NeurIPS) 30 (2017)."},{"key":"e_1_3_3_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00872"},{"key":"e_1_3_3_1_57_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Wang Ting-Chun","year":"2018","unstructured":"Ting-Chun Wang, Ming-Yu Liu, Jun-Yan Zhu, Guilin Liu, Andrew Tao, Jan Kautz, and Bryan Catanzaro. 2018. Video-to-Video Synthesis. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00991"},{"key":"e_1_3_3_1_59_1","doi-asserted-by":"publisher","unstructured":"Zhou Wang A.\u00a0C. Bovik H.\u00a0R. Sheikh and E.\u00a0P. Simoncelli. 2004. Image Quality Assessment: From Error Visibility to Structural Similarity. IEEE Transactions on Image Processing 13 4 (2004) 600\u2013612. 10.1109\/TIP.2003.819861","DOI":"10.1109\/TIP.2003.819861"},{"key":"e_1_3_3_1_60_1","unstructured":"Zhengyi Wang Cheng Lu Yikai Wang Fan Bao Chongxuan Li Hang Su and Jun Zhu. 2023. Prolificdreamer: High-fidelity and diverse text-to-3d generation with variational score distillation. Conference on Neural Information Processing Systems (NeurIPS) 36 (2023) 8406\u20138441."},{"key":"e_1_3_3_1_61_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Xi Haocheng","year":"2025","unstructured":"Haocheng Xi, Shuo Yang, Yilong Zhao, Chenfeng Xu, Muyang Li, Xiuyu Li, Yujun Lin, Han Cai, Jintao Zhang, Dacheng Li, et\u00a0al. 2025. Sparse VideoGen: Accelerating Video Diffusion Transformers with Spatial-Temporal Sparsity. In International Conference on Machine Learning (ICML)."},{"key":"e_1_3_3_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCVW60793.2023.00455"},{"key":"e_1_3_3_1_63_1","volume-title":"Proceedings of the 2025 Conference on Machine Learning and Systems (MLSys)","author":"Ye Zihao","year":"2025","unstructured":"Zihao Ye, Lequn Chen, Ruihang Lai, Wuwei Lin, Yineng Zhang, Stephanie Wang, Tianqi Chen, Baris Kasikci, Vinod Grover, Arvind Krishnamurthy, and Luis Ceze. 2025. FlashInfer: Efficient and Customizable Attention Engine for LLM Inference Serving. In Proceedings of the 2025 Conference on Machine Learning and Systems (MLSys)."},{"key":"e_1_3_3_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.00053"},{"key":"e_1_3_3_1_65_1","volume-title":"Conference on Neural Information Processing Systems (NeurIPS)","author":"Yin Tianwei","year":"2024","unstructured":"Tianwei Yin, Michael Gharbi, Taesung Park, Richard Zhang, Eli Shechtman, Fredo Durand, and William\u00a0T Freeman. 2024a. Improved Distribution Matching Distillation for Fast Image Synthesis. In Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_3_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00632"},{"key":"e_1_3_3_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02138"},{"key":"e_1_3_3_1_68_1","unstructured":"Yuanhao Zhai Kevin Lin Zhengyuan Yang Linjie Li Jianfeng Wang Chung-Ching Lin David Doermann Junsong Yuan and Lijuan Wang. 2024. Motion Consistency Model: Accelerating Video Diffusion with Disentangled Motion-Appearance Distillation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.06890 (2024)."},{"key":"e_1_3_3_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00068"},{"key":"e_1_3_3_1_70_1","volume-title":"International Conference on Machine Learning (ICML)","author":"Zhang Yuang","year":"2025","unstructured":"Yuang Zhang, Jiaxi Gu, Li-Wen Wang, Han Wang, Junqi Cheng, Yuefeng Zhu, and Fangyuan Zou. 2025. MimicMotion: High-Quality Human Motion Video Generation with Confidence-aware Pose Guidance. In International Conference on Machine Learning (ICML)."}],"event":{"name":"SA Conference Papers '25: SIGGRAPH Asia 2025 Conference Papers","location":"Hong Kong Hong Kong","acronym":"SA Conference Papers '25","sponsor":["SIGGRAPH ACM Special Interest Group on Computer Graphics and Interactive Techniques"]},"container-title":["Proceedings of the SIGGRAPH Asia 2025 Conference Papers"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3757377.3763831","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T03:26:01Z","timestamp":1765250761000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3757377.3763831"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,14]]},"references-count":69,"alternative-id":["10.1145\/3757377.3763831","10.1145\/3757377"],"URL":"https:\/\/doi.org\/10.1145\/3757377.3763831","relation":{},"subject":[],"published":{"date-parts":[[2025,12,14]]},"assertion":[{"value":"2025-12-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}