{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:56:18Z","timestamp":1781538978902,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":46,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T00:00:00Z","timestamp":1781481600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,6,16]]},"DOI":"10.1145\/3805622.3810814","type":"proceedings-article","created":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T14:42:57Z","timestamp":1781534577000},"page":"1626-1634","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["StepVAR: Structure-Texture Guided Pruning for Visual Autoregressive Models"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-1915-0248","authenticated-orcid":false,"given":"Keli","family":"Liu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7154-6478","authenticated-orcid":false,"given":"Zhendong","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1690-9836","authenticated-orcid":false,"given":"Wengang","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2188-3028","authenticated-orcid":false,"given":"Houqiang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2026,6,15]]},"reference":[{"key":"e_1_3_3_1_2_2","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Bolya Daniel","year":"2023","unstructured":"Daniel Bolya, Cheng-Yang Fu, Xiaoliang Dai, Peizhao Zhang, Christoph Feichtenhofer, and Judy Hoffman. 2023. Token Merging: Your ViT But Faster. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_3_1_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3731715.3733337"},{"key":"e_1_3_3_1_4_2","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Chen Junsong","year":"2024","unstructured":"Junsong Chen, Jincheng YU, Chongjian GE, Lewei Yao, Enze Xie, Zhongdao Wang, James Kwok, Ping Luo, Huchuan Lu, and Zhenguo Li. 2024. PixArt-$\\alpha$: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_3_1_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01592"},{"key":"e_1_3_3_1_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.02173"},{"key":"e_1_3_3_1_7_2","unstructured":"Haoge Deng Ting Pan Haiwen Diao Zhengxiong Luo Yufeng Cui Huchuan Lu Shiguang Shan Yonggang Qi and Xinlong Wang. 2024. Autoregressive Video Generation without Vector Quantization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.14169 (2024)."},{"key":"e_1_3_3_1_8_2","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Esser Patrick","year":"2024","unstructured":"Patrick Esser, Sumith Kulal, Andreas Blattmann, et\u00a0al. 2024. Scaling Rectified Flow Transformers for High-Resolution Image Synthesis. In Proceedings of the International Conference on Machine Learning."},{"key":"e_1_3_3_1_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01268"},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"crossref","unstructured":"Dhruba Ghosh Hannaneh Hajishirzi and Ludwig Schmidt. 2023. Geneval: An object-focused framework for evaluating text-to-image alignment. Advances in Neural Information Processing Systems 36 (2023) 52132\u201352152.","DOI":"10.52202\/075280-2270"},{"key":"e_1_3_3_1_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV51701.2025.01767"},{"key":"e_1_3_3_1_12_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52734.2025.01467"},{"key":"e_1_3_3_1_13_2","unstructured":"Martin Heusel Hubert Ramsauer Thomas Unterthiner Bernhard Nessler and Sepp Hochreiter. 2017. Gans trained by a two time-scale update rule converge to a local nash equilibrium. Proceedings of the Advances in Neural Information Processing Systems 30 (2017)."},{"key":"e_1_3_3_1_14_2","unstructured":"Xiwei Hu Rui Wang Yixiao Fang Bin Fu Pei Cheng and Gang Yu. 2024. Ella: Equip diffusion models with llm for enhanced semantic alignment. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2403.05135 (2024)."},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02060"},{"key":"e_1_3_3_1_16_2","unstructured":"Weijie Kong Qi Tian Zijian Zhang Rox Min Zuozhuo Dai Jin Zhou Jiangfeng Xiong Xin Li Bo Wu Jianwei Zhang et\u00a0al. 2024. Hunyuanvideo: A systematic framework for large video generative models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.03603 (2024)."},{"key":"e_1_3_3_1_17_2","unstructured":"Daiqing Li Aleks Kamko Ehsan Akhgari Ali Sabet Linmiao Xu and Suhail Doshi. 2024. Playground v2. 5: Three insights towards enhancing aesthetic quality in text-to-image generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2402.17245 (2024)."},{"key":"e_1_3_3_1_18_2","unstructured":"Han Li Xinyu Peng Yaoming Wang Zelin Peng Xin Chen Rongxiang Weng Jingang Wang Xunliang Cai Wenrui Dai and Hongkai Xiong. 2025. Onecat: Decoder-only auto-regressive model for unified understanding and generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2509.03498 (2025)."},{"key":"e_1_3_3_1_19_2","unstructured":"Kunjun Li Zigeng Chen Cheng-Yen Yang and Jenq-Neng Hwang. 2025. Memory-Efficient Visual Autoregressive Modeling with Scale-Aware KV Cache Compression. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.19602 (2025)."},{"key":"e_1_3_3_1_20_2","unstructured":"Senmao Li Taihang Hu Fahad\u00a0Shahbaz Khan Linxuan Li Shiqi Yang Yaxing Wang Ming-Ming Cheng and Jian Yang. 2023. Faster diffusion: Rethinking the role of unet encoder in diffusion models. CoRR (2023)."},{"key":"e_1_3_3_1_21_2","unstructured":"Tianhong Li Yonglong Tian He Li Mingyang Deng and Kaiming He. 2024. Autoregressive Image Generation without Vector Quantization. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.11838 (2024)."},{"key":"e_1_3_3_1_22_2","volume-title":"The Thirty-ninth Annual Conference on Neural Information Processing Systems","author":"Li Ying","unstructured":"Ying Li, Huan Wang, et\u00a0al. [n. d.]. FreqExit: Enabling Early-Exit Inference for Visual Autoregressive Models via Frequency-Aware Guidance. In The Thirty-ninth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_1_23_2","doi-asserted-by":"publisher","DOI":"10.1145\/3652583.3658071"},{"key":"e_1_3_3_1_24_2","volume-title":"The Thirty-ninth Annual Conference on Neural Information Processing Systems","author":"Liu Jinlai","year":"2025","unstructured":"Jinlai Liu, Jian Han, Bin Yan, Wuhui, Fengda Zhu, Xing Wang, Yi Jiang, BINGYUE PENG, and Zehuan Yuan. 2025. InfinityStar: Unified Spacetime AutoRegressive Modeling for Visual Generation. In The Thirty-ninth Annual Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_1_25_2","doi-asserted-by":"crossref","unstructured":"Cheng Lu Yuhao Zhou Fan Bao Jianfei Chen Chongxuan Li and Jun Zhu. 2022. Dpm-solver: A fast ode solver for diffusion probabilistic model sampling in around 10 steps. Advances in Neural Information Processing Systems 35 (2022) 5775\u20135787.","DOI":"10.52202\/068431-0418"},{"key":"e_1_3_3_1_26_2","unstructured":"Zhuoyan Luo Fengyuan Shi Yixiao Ge Yujiu Yang Limin Wang and Ying Shan. 2024. Open-MAGVIT2: An Open-Source Project Toward Democratizing Auto-regressive Visual Generation. CoRR abs\/2409.04410 (2024)."},{"key":"e_1_3_3_1_27_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01492"},{"key":"e_1_3_3_1_28_2","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Podell Dustin","year":"2024","unstructured":"Dustin Podell, Zion English, Kyle Lacey, et\u00a0al. 2024. SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_3_1_29_2","unstructured":"Yunpeng Qu Kun Yuan Jinhua Hao Kai Zhao Qizhi Xie Ming Sun and Chao Zhou. 2025. Visual Autoregressive Modeling for Image Super-Resolution. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.18993 (2025)."},{"key":"e_1_3_3_1_30_2","first-page":"8748","volume-title":"Proceedings of the International Conference on Machine Learning","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong\u00a0Wook Kim, Chris Hallacy, et\u00a0al. 2021. Learning transferable visual models from natural language supervision. In Proceedings of the International Conference on Machine Learning. PmLR, 8748\u20138763."},{"key":"e_1_3_3_1_31_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_3_1_32_2","first-page":"87","volume-title":"European Conference on Computer Vision","author":"Sauer Axel","year":"2024","unstructured":"Axel Sauer, Dominik Lorenz, Andreas Blattmann, and Robin Rombach. 2024. Adversarial diffusion distillation. In European Conference on Computer Vision. Springer, 87\u2013103."},{"key":"e_1_3_3_1_33_2","unstructured":"Peize Sun Yi Jiang Shoufa Chen Shilong Zhang Bingyue Peng Ping Luo and Zehuan Yuan. 2024. Autoregressive model beats diffusion: Llama for scalable image generation. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2406.06525 (2024)."},{"key":"e_1_3_3_1_34_2","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Tang Haotian","year":"2025","unstructured":"Haotian Tang, Yecheng Wu, Shang Yang, Enze Xie, Junsong Chen, Junyu Chen, Zhuoyang Zhang, Han Cai, Yao Lu, and Song Han. 2025. HART: Efficient Visual Generation with Hybrid Autoregressive Transformer. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_3_1_35_2","doi-asserted-by":"crossref","unstructured":"Keyu Tian Yi Jiang Zehuan Yuan Bingyue Peng and Liwei Wang. 2024. Visual autoregressive modeling: Scalable image generation via next-scale prediction. Proceedings of the Advances in Neural Information Processing Systems 37 (2024) 84839\u201384865.","DOI":"10.52202\/079017-2694"},{"key":"e_1_3_3_1_36_2","volume-title":"Proceedings of the Advances in Neural Information Processing Systems","volume":"30","author":"Oord Aaron van\u00a0den","year":"2017","unstructured":"Aaron van\u00a0den Oord, Oriol Vinyals, and koray kavukcuoglu. 2017. Neural Discrete Representation Learning. In Proceedings of the Advances in Neural Information Processing Systems , Vol.\u00a030."},{"key":"e_1_3_3_1_37_2","unstructured":"Anton Voronov Denis Kuznedelev Mikhail Khoroshikh Valentin Khrulkov and Dmitry Baranchuk. 2024. Switti: Designing scale-wise transformers for text-to-image synthesis. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2412.01819 (2024)."},{"key":"e_1_3_3_1_38_2","first-page":"7559","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Wang Siyang","year":"2025","unstructured":"Siyang Wang, Naishan Zheng, Jie Huang, and Feng Zhao. 2025. Navigating Image Restoration with VAR\u2019s Distribution Alignment Prior. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition. 7559\u20137569."},{"key":"e_1_3_3_1_39_2","unstructured":"Xinlong Wang Xiaosong Zhang Zhengxiong Luo Quan Sun Yufeng Cui Jinsheng Wang Fan Zhang Yueze Wang Zhen Li Qiying Yu et\u00a0al. 2024. Emu3: Next-token prediction is all you need. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2409.18869 (2024)."},{"key":"e_1_3_3_1_40_2","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Xie Jinheng","year":"2025","unstructured":"Jinheng Xie, Weijia Mao, Zechen Bai, et\u00a0al. 2025. Show-o: One Single Transformer to Unify Multimodal Understanding and Generation. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_3_1_41_2","unstructured":"Zhuoyi Yang Jiayan Teng Wendi Zheng Ming Ding Shiyu Huang Jiazheng Xu Yuanming Yang Wenyi Hong Xiaohan Zhang Guanyu Feng et\u00a0al. 2024. Cogvideox: Text-to-video diffusion models with an expert transformer. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2408.06072 (2024)."},{"key":"e_1_3_3_1_42_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00632"},{"key":"e_1_3_3_1_43_2","unstructured":"Yuechen Zhang Jinbo Xing Bin Xia Shaoteng Liu Bohao Peng Xin Tao Pengfei Wan Eric Lo and Jiaya Jia. 2025. Training-Free Efficient Video Generation via Dynamic Token Carving. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2505.16864 (2025)."},{"key":"e_1_3_3_1_44_2","doi-asserted-by":"crossref","unstructured":"Wenliang Zhao Lujia Bai Yongming Rao Jie Zhou and Jiwen Lu. 2023. Unipc: A unified predictor-corrector framework for fast sampling of diffusion models. Advances in Neural Information Processing Systems 36 (2023) 49842\u201349869.","DOI":"10.52202\/075280-2170"},{"key":"e_1_3_3_1_45_2","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Zhao Wangbo","year":"2025","unstructured":"Wangbo Zhao, Yizeng Han, Jiasheng Tang, Kai Wang, Yibing Song, Gao Huang, Fan Wang, and Yang You. 2025. Dynamic Diffusion Transformer. In Proceedings of the International Conference on Learning Representations."},{"key":"e_1_3_3_1_46_2","unstructured":"Xianwei Zhuang Yuxin Xie Yufan Deng Liming Liang Jinghan Ru Yuguo Yin and Yuexian Zou. 2025. Vargpt: Unified understanding and generation in a visual autoregressive multimodal large language model. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2501.12327 (2025)."},{"key":"e_1_3_3_1_47_2","volume-title":"Proceedings of the International Conference on Learning Representations","author":"Zou Chang","year":"2025","unstructured":"Chang Zou, Xuyang Liu, Ting Liu, Siteng Huang, and Linfeng Zhang. 2025. Accelerating Diffusion Transformers with Token-wise Feature Caching. In Proceedings of the International Conference on Learning Representations."}],"event":{"name":"ICMR '26: International Conference on Multimedia Retrieval","location":"Amsterdam The Netherlands","acronym":"ICMR '26","sponsor":["SIGMM ACM Special Interest Group on Multimedia"]},"container-title":["Proceedings of the 2026 International Conference on Multimedia Retrieval"],"original-title":[],"deposited":{"date-parts":[[2026,6,15]],"date-time":"2026-06-15T15:29:40Z","timestamp":1781537380000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3805622.3810814"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,6,15]]},"references-count":46,"alternative-id":["10.1145\/3805622.3810814","10.1145\/3805622"],"URL":"https:\/\/doi.org\/10.1145\/3805622.3810814","relation":{},"subject":[],"published":{"date-parts":[[2026,6,15]]},"assertion":[{"value":"2026-06-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}