{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T16:52:58Z","timestamp":1774716778255,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":23,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,23]],"date-time":"2024-06-23T00:00:00Z","timestamp":1719100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000028","name":"Semiconductor Research Corporation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100000028","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2048183"],"award-info":[{"award-number":["2048183"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,23]]},"DOI":"10.1145\/3649329.3658473","type":"proceedings-article","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:27:22Z","timestamp":1731007642000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":39,"title":["EDGE-LLM: Enabling Efficient Large Language Model Adaptation on Edge Devices via Unified Compression and Adaptive Layer Voting"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9981-4981","authenticated-orcid":false,"given":"Zhongzhi","family":"Yu","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-2279-7506","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9233-1947","authenticated-orcid":false,"given":"Yuhan","family":"Li","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-8849-1643","authenticated-orcid":false,"given":"Ruijie","family":"Gao","sequence":"additional","affiliation":[{"name":"Georgoa Institute of Technology, Atlanta, GA, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-7668-2561","authenticated-orcid":false,"given":"Xiaoya","family":"Zhou","sequence":"additional","affiliation":[{"name":"University of California, Santa Barbara, Santa Barbara, CA, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9756-7924","authenticated-orcid":false,"given":"Sreenidhi Reddy","family":"Bommu","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8023-1551","authenticated-orcid":false,"given":"Yang (Katie)","family":"Zhao","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, United States"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5946-203X","authenticated-orcid":false,"given":"Yingyan (Celine)","family":"Lin","sequence":"additional","affiliation":[{"name":"Georgia Institute of Technology, Atlanta, GA, United States"}]}],"member":"320","published-online":{"date-parts":[[2024,11,7]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Bubeck et al. 2023. 
Sparks of artificial general intelligence: Early experiments with gpt-4. arXiv preprint arXiv:2303.12712 (2023)."},{"key":"e_1_3_2_1_2_1","volume-title":"Qlora: Efficient finetuning of quantized llms. arXiv","author":"Dettmers","year":"2023","unstructured":"Dettmers et al. 2023. Qlora: Efficient finetuning of quantized llms. arXiv (2023)."},{"key":"e_1_3_2_1_3_1","first-page":"36","article-title":"2024. Memory-efficient fine-tuning of compressed large language models via sub-4-bit integer quantization","author":"Kim","year":"2024","unstructured":"Kim et al. 2024. Memory-efficient fine-tuning of compressed large language models via sub-4-bit integer quantization. NeurIPS 36 (2024).","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_4_1","unstructured":"Yu et al. 2022. Unified visual transformer compression. arXiv preprint arXiv:2203.08243 (2022)."},{"key":"e_1_3_2_1_5_1","volume-title":"Hint-aug: Drawing hints from foundation vision transformers towards boosted few-shot parameter-efficient tuning. In CVPR. 11102--11112.","author":"Yu","year":"2023","unstructured":"Yu et al. 2023. Hint-aug: Drawing hints from foundation vision transformers towards boosted few-shot parameter-efficient tuning. In CVPR. 11102--11112."},{"key":"e_1_3_2_1_6_1","unstructured":"Yu et al. 2023. Master-ASR: achieving multilingual scalability and low-resource adaptation in ASR with modular learning. In ICML. PMLR 40475--40487."},{"key":"e_1_3_2_1_7_1","unstructured":"Frantar et al. 2023. SparseGPT: Massive Language Models Can Be Accurately Pruned in One-Shot. (2023)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Fu et al. 2021. Enabling random precision switch for winning both adversarial robustness and efficiency. In MICRO. 225--237.","DOI":"10.1145\/3466752.3480082"},{"key":"e_1_3_2_1_9_1","unstructured":"Hendrycks et al. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_1_10_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu","year":"2021","unstructured":"Hu et al. 2021. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685 (2021)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Liu et al. 2023. LLM-QAT: Data-Free Quantization Aware Training for Large Language Models. arXiv (2023).","DOI":"10.18653\/v1\/2024.findings-acl.26"},{"key":"e_1_3_2_1_12_1","unstructured":"Merity et al. 2016. Pointer sentinel mixture models. arXiv preprint arXiv:1609.07843 (2016)."},{"key":"e_1_3_2_1_13_1","unstructured":"Meta. 2022. Quest Pro. https:\/\/www.meta.com\/quest\/quest-pro\/."},{"key":"e_1_3_2_1_14_1","unstructured":"NVIDIA. 2020. NVIDIA Jetson TX2. www.nvidia.com\/en-us\/autonomous-machines\/embedded-systems\/jetson-tx2\/."},{"key":"e_1_3_2_1_15_1","unstructured":"Pearce et al. 2021. Understanding softmax confidence and uncertainty. arXiv preprint arXiv:2106.04972 (2021)."},{"key":"e_1_3_2_1_16_1","unstructured":"Samajdar et al. 2023. Systolic CNN AcceLErator Simulator (SCALE Sim). https:\/\/github.com\/ARM-software\/SCALE-Sim."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Shao et al. 2023. An Efficient Training Accelerator for Transformers With Hardware-Algorithm Co-Optimization. VLSI (2023).","DOI":"10.1109\/TVLSI.2023.3305569"},{"key":"e_1_3_2_1_18_1","unstructured":"Sheng et al. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. 
(2023)."},{"key":"e_1_3_2_1_19_1","first-page":"12991","article-title":"2022. Lst: Ladder side-tuning for parameter and memory efficient transfer learning","volume":"35","author":"Sung","year":"2022","unstructured":"Sung et al. 2022. Lst: Ladder side-tuning for parameter and memory efficient transfer learning. NeurIPS 35 (2022), 12991--13005.","journal-title":"NeurIPS"},{"key":"e_1_3_2_1_20_1","unstructured":"Taori et al. 2023. Stanford alpaca: An instruction-following llama model."},{"key":"e_1_3_2_1_21_1","volume-title":"Branchynet: Fast inference via early exiting from deep neural networks. In ICPR.","author":"Teerapittayanon","year":"2016","unstructured":"Teerapittayanon et al. 2016. Branchynet: Fast inference via early exiting from deep neural networks. In ICPR."},{"key":"e_1_3_2_1_22_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Touvron","year":"2023","unstructured":"Touvron et al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_2_1_23_1","volume-title":"Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv","author":"Zhang","year":"2023","unstructured":"Zhang et al. 2023. Llama-adapter: Efficient fine-tuning of language models with zero-init attention. arXiv (2023)."}],"event":{"name":"DAC '24: 61st ACM\/IEEE Design Automation Conference","location":"San Francisco CA USA","acronym":"DAC '24","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE-CEDA","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 61st ACM\/IEEE Design Automation Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3658473","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649329.3658473","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649329.3658473","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:01Z","timestamp":1750295881000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3658473"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,23]]},"references-count":23,"alternative-id":["10.1145\/3649329.3658473","10.1145\/3649329"],"URL":"https:\/\/doi.org\/10.1145\/3649329.3658473","relation":{},"subject":[],"published":{"date-parts":[[2024,6,23]]},"assertion":[{"value":"2024-11-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}