{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,21]],"date-time":"2025-11-21T11:32:42Z","timestamp":1763724762532,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":20,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,27]],"date-time":"2024-10-27T00:00:00Z","timestamp":1729987200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,27]]},"DOI":"10.1145\/3676536.3676695","type":"proceedings-article","created":{"date-parts":[[2025,4,9]],"date-time":"2025-04-09T13:26:26Z","timestamp":1744205186000},"page":"1-8","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["ProPD: Dynamic Token Tree Pruning and Generation for LLM Parallel Decoding"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-5478-3604","authenticated-orcid":false,"given":"Shuzhang","family":"Zhong","sequence":"first","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"},{"name":"School of Integrated Circuit, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-0672-6389","authenticated-orcid":false,"given":"Zebin","family":"Yang","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"},{"name":"School of Integrated Circuit, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6024-7086","authenticated-orcid":false,"given":"Ruihao","family":"Gong","sequence":"additional","affiliation":[{"name":"Sensetime Research, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7514-0767","authenticated-orcid":false,"given":"Runsheng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuit, Peking University, Beijing, China"},{"name":"Institute of Electronic Design Automation, Peking University, Wuxi, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8146-4821","authenticated-orcid":false,"given":"Ru","family":"Huang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuit, Peking University, Beijing, China"},{"name":"Institute of Electronic Design Automation, Peking University, Wuxi, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7212-2264","authenticated-orcid":false,"given":"Meng","family":"Li","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"},{"name":"School of Integrated Circuit, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,4,9]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Fast and Robust Early-Exiting Framework for Autoregressive Language Models with Synchronized Parallel Decoding. arXiv preprint arXiv:2310.05424","author":"Bae Sangmin","year":"2023","unstructured":"Sangmin Bae, Jongwoo Ko, Hwanjun Song, and Se-Young Yun. 2023. Fast and Robust Early-Exiting Framework for Autoregressive Language Models with Synchronized Parallel Decoding. arXiv preprint arXiv:2310.05424 (2023)."},{"key":"e_1_3_2_1_2_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. 
Advances in neural information processing systems 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_3_1","volume-title":"Medusa: Simple framework for accelerating llm generation with multiple decoding heads.","author":"Cai Tianle","year":"2023","unstructured":"Tianle Cai, Yuhong Li, Zhengyang Geng, Hongwu Peng, and Tri Dao. 2023. Medusa: Simple framework for accelerating llm generation with multiple decoding heads."},{"key":"e_1_3_2_1_4_1","volume-title":"Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318","author":"Chen Charlie","year":"2023","unstructured":"Charlie Chen, Sebastian Borgeaud, Geoffrey Irving, Jean-Baptiste Lespiau, Laurent Sifre, and John Jumper. 2023. Accelerating large language model decoding with speculative sampling. arXiv preprint arXiv:2302.01318 (2023)."},{"key":"e_1_3_2_1_5_1","volume-title":"Multi-news: A large-scale multi-document summarization dataset and abstractive hierarchical model. arXiv preprint arXiv:1906.01749","author":"Fabbri Alexander R","year":"2019","unstructured":"Alexander R Fabbri, Irene Li, Tianwei She, Suyi Li, and Dragomir R Radev. 2019. Multi-news: A large-scale multi-document summarization dataset and abstractive hierarchical model. arXiv preprint arXiv:1906.01749 (2019)."},{"key":"e_1_3_2_1_6_1","volume-title":"Mohamed Afify, and Hany Hassan Awadalla.","author":"Hendy Amr","year":"2023","unstructured":"Amr Hendy, Mohamed Abdelrehim, Amr Sharaf, Vikas Raunak, Mohamed Gabr, Hitokazu Matsushita, Young Jin Kim, Mohamed Afify, and Hany Hassan Awadalla. 2023. How good are gpt models at machine translation? a comprehensive evaluation. arXiv preprint arXiv:2302.09210 (2023)."},{"key":"e_1_3_2_1_7_1","unstructured":"Yufan Jiang Qiaozhi He Xiaomin Zhuang Zhihua Wu Kunpeng Wang Wenlai Zhao and Guangwen Yang. 2023. RecycleGPT: An Autoregressive Language Model with Recyclable Module. 
arXiv:2308.03421 [cs.CL]"},{"key":"e_1_3_2_1_8_1","unstructured":"Sehoon Kim Coleman Hooper Thanakul Wattanawong Minwoo Kang Ruohan Yan Hasan Genc Grace Dinh Qijing Huang Kurt Keutzer Michael W Mahoney et al. 2023. Full stack optimization of transformer inference: a survey. arXiv preprint arXiv:2302.14017 (2023)."},{"key":"e_1_3_2_1_9_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Leviathan Yaniv","year":"2023","unstructured":"Yaniv Leviathan, Matan Kalman, and Yossi Matias. 2023. Fast inference from transformers via speculative decoding. In International Conference on Machine Learning. PMLR, 19274--19286."},{"key":"e_1_3_2_1_10_1","volume-title":"Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia.","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Zeyu Wang, Rae Ying Yee Wong, Zhuoming Chen, Daiyaan Arfeen, Reyna Abhyankar, and Zhihao Jia. 2023. SpecInfer: Accelerating Generative LLM Serving with Speculative Inference and Token Tree Verification. arXiv preprint arXiv:2305.09781 (2023)."},{"volume-title":"d.]. Chatgpt-prompts. https:\/\/huggingface.co\/datasets\/MohamedRashad\/ChatGPT-prompts","year":"2023","key":"e_1_3_2_1_11_1","unstructured":"MohamedRashad. [n. d.]. Chatgpt-prompts. https:\/\/huggingface.co\/datasets\/MohamedRashad\/ChatGPT-prompts 2023."},{"key":"e_1_3_2_1_12_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans Ilya Sutskever et al. 2018. Improving language understanding by generative pre-training. (2018)."},{"key":"e_1_3_2_1_13_1","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_2_1_14_1","volume-title":"Accelerating Transformer Inference for Translation via Parallel Decoding. 
arXiv preprint arXiv:2305.10427","author":"Santilli Andrea","year":"2023","unstructured":"Andrea Santilli, Silvio Severino, Emilian Postolache, Valentino Maiorca, Michele Mancusi, Riccardo Marin, and Emanuele Rodol\u00e0. 2023. Accelerating Transformer Inference for Translation via Parallel Decoding. arXiv preprint arXiv:2305.10427 (2023)."},{"key":"e_1_3_2_1_15_1","volume-title":"Accelerating llm inference with staged speculative decoding. arXiv preprint arXiv:2308.04623","author":"Spector Benjamin","year":"2023","unstructured":"Benjamin Spector and Chris Re. 2023. Accelerating llm inference with staged speculative decoding. arXiv preprint arXiv:2308.04623 (2023)."},{"key":"e_1_3_2_1_16_1","volume-title":"Blockwise parallel decoding for deep autoregressive models. Advances in Neural Information Processing Systems 31","author":"Stern Mitchell","year":"2018","unstructured":"Mitchell Stern, Noam Shazeer, and Jakob Uszkoreit. 2018. Blockwise parallel decoding for deep autoregressive models. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_17_1","volume-title":"Hashimoto","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. 2023. Stanford Alpaca: An Instruction-following LLaMA model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca."},{"key":"e_1_3_2_1_18_1","volume-title":"LLMCad: Fast and Scalable On-device Large Language Model Inference. arXiv preprint arXiv:2309.04255","author":"Xu Daliang","year":"2023","unstructured":"Daliang Xu, Wangsong Yin, Xin Jin, Ying Zhang, Shiyun Wei, Mengwei Xu, and Xuanzhe Liu. 2023. LLMCad: Fast and Scalable On-device Large Language Model Inference. 
arXiv preprint arXiv:2309.04255 (2023)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/s10115-022-01744-y"},{"key":"e_1_3_2_1_20_1","unstructured":"Lianmin Zheng Wei-Lin Chiang Ying Sheng Siyuan Zhuang Zhanghao Wu Yonghao Zhuang Zi Lin Zhuohan Li Dacheng Li Eric. P Xing Hao Zhang Joseph E. Gonzalez and Ion Stoica. 2023. Judging LLM-as-a-judge with MT-Bench and Chatbot Arena. arXiv:2306.05685 [cs.CL]"}],"event":{"name":"ICCAD '24: 43rd IEEE\/ACM International Conference on Computer-Aided Design","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE CAS","IEEE CEDA","IEEE EDS"],"location":"Newark Liberty International Airport Marriott New York NY USA","acronym":"ICCAD '24"},"container-title":["Proceedings of the 43rd IEEE\/ACM International Conference on Computer-Aided Design"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676695","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676536.3676695","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T23:43:57Z","timestamp":1750290237000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676536.3676695"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,27]]},"references-count":20,"alternative-id":["10.1145\/3676536.3676695","10.1145\/3676536"],"URL":"https:\/\/doi.org\/10.1145\/3676536.3676695","relation":{},"subject":[],"published":{"date-parts":[[2024,10,27]]},"assertion":[{"value":"2025-04-09","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}