{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,1]],"date-time":"2026-04-01T18:11:04Z","timestamp":1775067064567,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":108,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62432004"],"award-info":[{"award-number":["62432004"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guoqiang Institute, Tsinghua University"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,8,6]]},"DOI":"10.1145\/3676642.3736114","type":"proceedings-article","created":{"date-parts":[[2025,8,6]],"date-time":"2025-08-06T22:19:59Z","timestamp":1754518799000},"page":"147-162","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["<scp>Neuralink:<\/scp>\n            Fast on-Device LLM Inference with Neuron Co-Activation Linking"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8272-8151","authenticated-orcid":false,"given":"Tuowei","family":"Wang","sequence":"first","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3590-7473","authenticated-orcid":false,"given":"Ruwen","family":"Fan","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-8024-4928","authenticated-orcid":false,"given":"Minxing","family":"Huang","sequence":"additional","affiliation":[{"name":"Tianjin University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1671-1367","authenticated-orcid":false,"given":"Zixu","family":"Hao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1013-1325","authenticated-orcid":false,"given":"Kun","family":"Li","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6214-5390","authenticated-orcid":false,"given":"Youyou","family":"Lu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4833-9789","authenticated-orcid":false,"given":"Yaoxue","family":"Zhang","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2782-183X","authenticated-orcid":false,"given":"Ju","family":"Ren","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,8,6]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Marah Abdin Sam Ade Jacobs Ammar Ahmad Awan Jyoti Aneja Ahmed Awadallah Hany Awadalla Nguyen Bach Amit Bahree Arash Bakhtiari Harkirat Behl et al. Phi-3 technical report: A highly capable language model locally on your phone. arXiv preprint arXiv:2404.14219 2024."},{"key":"e_1_3_2_1_2_1","volume-title":"Deep learning using rectified linear units (relu). arXiv preprint arXiv:1803.08375","author":"Agarap AF","year":"2018","unstructured":"AF Agarap. Deep learning using rectified linear units (relu). arXiv preprint arXiv:1803.08375, 2018."},{"key":"e_1_3_2_1_3_1","volume-title":"Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 
Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245, 2023."},{"key":"e_1_3_2_1_4_1","volume-title":"Mohammad Rastegari, and Mehrdad Farajtabar. Llm in a flash: Efficient large language model inference with limited memory. arXiv preprint arXiv:2312.11514","author":"Alizadeh Keivan","year":"2023","unstructured":"Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar. Llm in a flash: Efficient large language model inference with limited memory. arXiv preprint arXiv:2312.11514, 2023."},{"key":"e_1_3_2_1_5_1","volume-title":"Palm 2 technical report. arXiv preprint arXiv:2305.10403","author":"Anil Rohan","year":"2023","unstructured":"Rohan Anil, Andrew M Dai, Orhan Firat, Melvin Johnson, Dmitry Lepikhin, Alexandre Passos, Siamak Shakeri, Emanuel Taropa, Paige Bailey, Zhifeng Chen, et al. Palm 2 technical report. arXiv preprint arXiv:2305.10403, 2023."},{"key":"e_1_3_2_1_6_1","volume-title":"liburing. https:\/\/github.com\/axboe\/liburing","author":"Axboe Jens","year":"2025","unstructured":"Jens Axboe. liburing. https:\/\/github.com\/axboe\/liburing, 2025. Accessed: 2025-02--25."},{"key":"e_1_3_2_1_7_1","unstructured":"Nathan Bell and Michael Garland. Efficient sparse matrix-vector multiplication on cuda. Technical report Nvidia Technical Report NVR-2008-004 Nvidia Corporation 2008."},{"key":"e_1_3_2_1_8_1","volume-title":"Conditional computation in neural networks for faster models. arXiv preprint arXiv:1511.06297","author":"Bengio Emmanuel","year":"2015","unstructured":"Emmanuel Bengio, Pierre-Luc Bacon, Joelle Pineau, and Doina Precup. Conditional computation in neural networks for faster models. arXiv preprint arXiv:1511.06297, 2015."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6239"},{"key":"e_1_3_2_1_10_1","volume-title":"Language models are few-shot learners. 
Advances in neural information processing systems, 33:1877--","author":"Brown Tom","year":"1901","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877-- 1901, 2020."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/1583991.1584053"},{"key":"e_1_3_2_1_12_1","first-page":"36","article-title":"2-bit quantization of large language models with guarantees","author":"Chee Jerry","year":"2024","unstructured":"Jerry Chee, Yaohui Cai, Volodymyr Kuleshov, and Christopher M De Sa. Quip: 2-bit quantization of large language models with guarantees. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","volume-title":"A survey of model compression and acceleration for deep neural networks. arXiv preprint arXiv:1710.09282","author":"Cheng Yu","year":"2017","unstructured":"Yu Cheng, Duo Wang, Pan Zhou, and Tao Zhang. A survey of model compression and acceleration for deep neural networks. arXiv preprint arXiv:1710.09282, 2017."},{"key":"e_1_3_2_1_14_1","volume-title":"Training verifiers to solve mathword problems. arXiv preprint arXiv:2110.14168","author":"Cobbe Karl","year":"2021","unstructured":"Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, Christopher Hesse, and John Schulman. Training verifiers to solve mathword problems. arXiv preprint arXiv:2110.14168, 2021."},{"key":"e_1_3_2_1_15_1","volume-title":"et al. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. 
arXiv preprint arXiv:2401.06066","author":"Dai Damai","year":"2024","unstructured":"Damai Dai, Chengqi Deng, Chenggang Zhao, RX Xu, Huazuo Gao, Deli Chen, Jiashi Li, Wangding Zeng, Xingkai Yu, Yu Wu, et al. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. arXiv preprint arXiv:2401.06066, 2024."},{"key":"e_1_3_2_1_16_1","volume-title":"Low-rank approximations for conditional feedforward computation in deep neural networks. arXiv preprint arXiv:1312.4461","author":"Davis Andrew","year":"2013","unstructured":"Andrew Davis and Itamar Arel. Low-rank approximations for conditional feedforward computation in deep neural networks. arXiv preprint arXiv:1312.4461, 2013."},{"key":"e_1_3_2_1_17_1","first-page":"30318","article-title":"8-bit matrix multiplication for transformers at scale","volume":"35","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. Gpt3. int8 (): 8-bit matrix multiplication for transformers at scale. Advances in Neural Information Processing Systems, 35:30318--30332, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_18_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_19_1","first-page":"188","volume-title":"Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","volume":"4","author":"Fan Ruwen","year":"2025","unstructured":"Ruwen Fan, Minhui Xie, Haodi Jiang, and Youyou Lu. Maxembed: Maximizing ssd bandwidth utilization for huge embedding models serving. 
In Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 4, ASPLOS '24, page 188--202, New York, NY, USA, 2025. Association for Computing Machinery."},{"issue":"120","key":"e_1_3_2_1_20_1","first-page":"1","article-title":"Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus William","year":"2022","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity. Journal of Machine Learning Research, 23(120):1--39, 2022.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_21_1","volume-title":"https:\/\/github.com\/bitsandbytes-foundation\/bitsandbytes","author":"Foundation BitsandBytes","year":"2025","unstructured":"BitsandBytes Foundation. Bitsandbytes. https:\/\/github.com\/bitsandbytes-foundation\/bitsandbytes, 2025. Accessed: 2025-02--28."},{"key":"e_1_3_2_1_22_1","volume-title":"A reviewof chatgpt applications in education, marketing, software engineering, and healthcare: Benefits, drawbacks, and research directions. arXiv preprint arXiv:2305.00237","author":"Fraiwan Mohammad","year":"2023","unstructured":"Mohammad Fraiwan and Natheer Khasawneh. A reviewof chatgpt applications in education, marketing, software engineering, and healthcare: Benefits, drawbacks, and research directions. arXiv preprint arXiv:2305.00237, 2023."},{"key":"e_1_3_2_1_23_1","volume-title":"Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv preprint arXiv:2210.17323","author":"Frantar Elias","year":"2022","unstructured":"Elias Frantar, Saleh Ashkboos, Torsten Hoefler, and Dan Alistarh. Gptq: Accurate post-training quantization for generative pre-trained transformers. 
arXiv preprint arXiv:2210.17323, 2022."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00021"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433723"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/364099.364331"},{"key":"e_1_3_2_1_27_1","volume-title":"ggerganov\/llama.cpp: Port of facebook's llama model in c\/c. https:\/\/github.com\/ggerganov\/llama.cpp","author":"Gerganov Georgi","year":"2024","unstructured":"Georgi Gerganov. ggerganov\/llama.cpp: Port of facebook's llama model in c\/c. https:\/\/github.com\/ggerganov\/llama.cpp, 2024."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1201\/9781003162810-13"},{"key":"e_1_3_2_1_29_1","unstructured":"Aaron Gokaslan Vanya Cohen Ellie Pavlick and Stefanie Tellex. Openwebtext corpus. http:\/\/Skylion007.github.io\/OpenWebTextCorpus 2019."},{"key":"e_1_3_2_1_30_1","volume-title":"Dynamic network surgery for efficient dnns. Advances in neural information processing systems, 29","author":"Guo Yiwen","year":"2016","unstructured":"Yiwen Guo, Anbang Yao, and Yurong Chen. Dynamic network surgery for efficient dnns. Advances in neural information processing systems, 29, 2016."},{"key":"e_1_3_2_1_31_1","volume-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. arXiv preprint arXiv:1510.00149","author":"Han Song","year":"2015","unstructured":"Song Han, Huizi Mao, and William J Dally. Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. arXiv preprint arXiv:1510.00149, 2015."},{"key":"e_1_3_2_1_32_1","volume-title":"Comparing biases for minimal network construction with back-propagation. Advances in neural information processing systems, 1","author":"Hanson Stephen","year":"1988","unstructured":"Stephen Hanson and Lorien Pratt. Comparing biases for minimal network construction with back-propagation. 
Advances in neural information processing systems, 1, 1988."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1137\/1024022"},{"key":"e_1_3_2_1_34_1","volume-title":"Chun Jason Xue, and Qingan Li. Chess: Optimizing llm inference via channel-wise thresholding and selective sparsification. arXiv preprint arXiv:2409.01366","author":"He Junhui","year":"2024","unstructured":"Junhui He, ShangyuWu,WeidongWen, Chun Jason Xue, and Qingan Li. Chess: Optimizing llm inference via channel-wise thresholding and selective sparsification. arXiv preprint arXiv:2409.01366, 2024."},{"key":"e_1_3_2_1_35_1","volume-title":"Soft filter pruning for accelerating deep convolutional neural networks. arXiv preprint arXiv:1808.06866","author":"He Yang","year":"2018","unstructured":"Yang He, Guoliang Kang, Xuanyi Dong, Yanwei Fu, and Yi Yang. Soft filter pruning for accelerating deep convolutional neural networks. arXiv preprint arXiv:1808.06866, 2018."},{"key":"e_1_3_2_1_36_1","volume-title":"Proceedings of the International Conference on Learning Representations (ICLR)","author":"Hendrycks Dan","year":"2021","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. Measuring massive multitask language understanding. Proceedings of the International Conference on Learning Representations (ICLR), 2021."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01270-0_19"},{"issue":"3","key":"e_1_3_2_1_38_1","first-page":"3","article-title":"Phi-2: The surprising power of small language models","volume":"1","author":"Javaheripi Mojan","year":"2023","unstructured":"Mojan Javaheripi, S\u00e9bastien Bubeck, Marah Abdin, Jyoti Aneja, Sebastien Bubeck, Caio C\u00e9sar Teodoro Mendes, Weizhu Chen, Allie Del Giorno, Ronen Eldan, Sivakanth Gopi, et al. Phi-2: The surprising power of small language models. 
Microsoft Research Blog, 1(3):3, 2023.","journal-title":"Microsoft Research Blog"},{"key":"e_1_3_2_1_39_1","volume-title":"February","author":"JEDEC.","year":"2021","unstructured":"JEDEC. Jedec announces publication of universal flash storage (ufs) standard. https:\/\/www.jedec.org, February 2021. Accessed: 2024--10-02."},{"key":"e_1_3_2_1_40_1","volume-title":"Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral 7b. arXiv preprint arXiv:2310.06825","author":"Jiang Albert Q","year":"2023","unstructured":"Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral 7b. arXiv preprint arXiv:2310.06825, 2023."},{"key":"e_1_3_2_1_41_1","volume-title":"Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. Mixtral of experts. arXiv preprint arXiv:2401.04088","author":"Sablayrolles Alexandre","year":"2024","unstructured":"AlbertQJiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, et al. Mixtral of experts. arXiv preprint arXiv:2401.04088, 2024."},{"key":"e_1_3_2_1_42_1","volume-title":"Scaling laws for neural language models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. Scaling laws for neural language models. arXiv preprint arXiv:2001.08361, 2020."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Tiffany H Kung Morgan Cheatham Arielle Medenilla Czarina Sillos Lorie De Leon Camille Elepa\u00f1o Maria Madriaga Rimel Aggabao Giezel Diaz-Candido James Maningo et al. 
Performance of chatgpt on usmle: potential for ai-assisted medical education using large language models. PLoS digital health 2(2):e0000198 2023.","DOI":"10.1371\/journal.pdig.0000198"},{"key":"e_1_3_2_1_44_1","first-page":"5533","volume-title":"International Conference on Machine Learning","author":"Kurtz Mark","year":"2020","unstructured":"Mark Kurtz, Justin Kopinsky, Rati Gelashvili, Alexander Matveev, John Carr, Michael Goin, William Leiserson, Sage Moore, Nir Shavit, and Dan Alistarh. Inducing and exploiting activation sparsity for fast inference on deep neural networks. In International Conference on Machine Learning, pages 5533--5543. PMLR, 2020."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.23919\/cje.2022.00.295"},{"key":"e_1_3_2_1_46_1","first-page":"2554","volume-title":"Proceedings of the IEEE conference on computer vision and pattern recognition","author":"Lebedev Vadim","year":"2016","unstructured":"Vadim Lebedev and Victor Lempitsky. Fast convnets using groupwise brain damage. In Proceedings of the IEEE conference on computer vision and pattern recognition, pages 2554--2564, 2016."},{"key":"e_1_3_2_1_47_1","volume-title":"Cats: Contextually-aware thresholding for sparsity in large language models. arXiv preprint arXiv:2404.08763","author":"Lee Donghyun","year":"2024","unstructured":"Donghyun Lee, Je-Yong Lee, Genghan Zhang, Mo Tiwari, and Azalia Mirhoseini. Cats: Contextually-aware thresholding for sparsity in large language models. arXiv preprint arXiv:2404.08763, 2024."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.emnlp-main.563"},{"key":"e_1_3_2_1_49_1","volume-title":"Personal llm agents: Insights and survey about the capability, efficiency and security. arXiv preprint arXiv:2401.05459","author":"Li Yuanchun","year":"2024","unstructured":"Yuanchun Li, Hao Wen, Weijun Wang, Xiangyu Li, Yizhen Yuan, Guohong Liu, Jiacheng Liu, Wenxing Xu, Xiang Wang, Yi Sun, et al. 
Personal llm agents: Insights and survey about the capability, efficiency and security. arXiv preprint arXiv:2401.05459, 2024."},{"key":"e_1_3_2_1_50_1","volume-title":"Sashank J Reddi, Ke Ye, Felix Chern, Felix Yu, Ruiqi Guo, et al. The lazy neuron phenomenon: On emergence of activation sparsity in transformers. arXiv preprint arXiv:2210.06313","author":"Li Zonglin","year":"2022","unstructured":"Zonglin Li, Chong You, Srinadh Bhojanapalli, Daliang Li, Ankit Singh Rawat, Sashank J Reddi, Ke Ye, Felix Chern, Felix Yu, Ruiqi Guo, et al. The lazy neuron phenomenon: On emergence of activation sparsity in transformers. arXiv preprint arXiv:2210.06313, 2022."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2021.07.045"},{"key":"e_1_3_2_1_52_1","volume-title":"Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434","author":"Liu Aixin","year":"2024","unstructured":"Aixin Liu, Bei Feng, Bin Wang, Bingxuan Wang, Bo Liu, Chenggang Zhao, Chengqi Dengr, Chong Ruan, Damai Dai, Daya Guo, et al. Deepseek-v2: A strong, economical, and efficient mixture-of-experts language model. arXiv preprint arXiv:2405.04434, 2024."},{"key":"e_1_3_2_1_53_1","volume-title":"Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437","author":"Liu Aixin","year":"2024","unstructured":"Aixin Liu, Bei Feng, Bing Xue, BingxuanWang, BochaoWu, Chengda Lu, Chenggang Zhao, Chengqi Deng, Chenyu Zhang, Chong Ruan, et al. Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437, 2024."},{"key":"e_1_3_2_1_54_1","volume-title":"Pruning algorithms to accelerate convolutional neural networks for edge applications: A survey. arXiv preprint arXiv:2005.04275","author":"Liu Jiayi","year":"2020","unstructured":"Jiayi Liu, Samarth Tripathi, Unmesh Kurup, and Mohak Shah. Pruning algorithms to accelerate convolutional neural networks for edge applications: A survey. 
arXiv preprint arXiv:2005.04275, 2020."},{"key":"e_1_3_2_1_55_1","volume-title":"A contemporary overview: Trends and applications of large language models on mobile devices. arXiv preprint arXiv:2412.03772","author":"Liu Lianjun","year":"2024","unstructured":"Lianjun Liu, Hongli An, Pengxuan Chen, and Longxiang Ye. A contemporary overview: Trends and applications of large language models on mobile devices. arXiv preprint arXiv:2412.03772, 2024."},{"key":"e_1_3_2_1_56_1","first-page":"22137","volume-title":"International Conference on Machine Learning","author":"Liu Zichang","year":"2023","unstructured":"Zichang Liu, Jue Wang, Tri Dao, Tianyi Zhou, Binhang Yuan, Zhao Song, Anshumali Shrivastava, Ce Zhang, Yuandong Tian, Christopher Re, et al. Deja vu: Contextual sparsity for efficient llms at inference time. In International Conference on Machine Learning, pages 22137--22176. PMLR, 2023."},{"key":"e_1_3_2_1_57_1","volume-title":"Saatavissa (viitattu 27.02. 2020): https:\/\/www. techrepublic. com\/blog\/the-enterprise-cloud\/calculate-iops-in-a-storage-array","author":"Lowe Scott","year":"2010","unstructured":"Scott Lowe. Calculate iops in a storage array. TechRepublic, verkkosivu, Saatavissa (viitattu 27.02. 2020): https:\/\/www. techrepublic. com\/blog\/the-enterprise-cloud\/calculate-iops-in-a-storage-array, 2010."},{"key":"e_1_3_2_1_58_1","volume-title":"Sparsing law: Towards large language models with greater activation sparsity. arXiv preprint arXiv:2411.02335","author":"Luo Yuqi","year":"2024","unstructured":"Yuqi Luo, Chenyang Song, Xu Han, Yingfa Chen, Chaojun Xiao, Zhiyuan Liu, and Maosong Sun. Sparsing law: Towards large language models with greater activation sparsity. arXiv preprint arXiv:2411.02335, 2024."},{"key":"e_1_3_2_1_59_1","volume-title":"Pointer sentinel mixture models","author":"Merity Stephen","year":"2016","unstructured":"Stephen Merity, Caiming Xiong, James Bradbury, and Richard Socher. 
Pointer sentinel mixture models, 2016."},{"key":"e_1_3_2_1_60_1","volume-title":"Towards efficient generative large language model serving: A survey from algorithms to systems. arXiv preprint arXiv:2312.15234","author":"Miao Xupeng","year":"2023","unstructured":"Xupeng Miao, Gabriele Oliaro, Zhihao Zhang, Xinhao Cheng, Hongyi Jin, Tianqi Chen, and Zhihao Jia. Towards efficient generative large language model serving: A survey from algorithms to systems. arXiv preprint arXiv:2312.15234, 2023."},{"key":"e_1_3_2_1_61_1","volume-title":"Oncel Tuzel, Golnoosh Samei, Mohammad Rastegari, and Mehrdad Farajtabar. Relu strikes back: Exploiting activation sparsity in large language models. arXiv preprint arXiv:2310.04564","author":"Mirzadeh Iman","year":"2023","unstructured":"Iman Mirzadeh, Keivan Alizadeh, Sachin Mehta, Carlo C Del Mundo, Oncel Tuzel, Golnoosh Samei, Mohammad Rastegari, and Mehrdad Farajtabar. Relu strikes back: Exploiting activation sparsity in large language models. arXiv preprint arXiv:2310.04564, 2023."},{"key":"e_1_3_2_1_62_1","volume-title":"GPU Technology Conference","author":"Naumov Maxim","year":"2010","unstructured":"Maxim Naumov, L Chien, Philippe Vandermersch, and Ujval Kapasi. Cusparse library. In GPU Technology Conference, 2010."},{"key":"e_1_3_2_1_63_1","volume-title":"Accelerating inference with sparsity using the nvidia ampere architecture and nvidia tensorrt","author":"NVIDIA.","year":"2021","unstructured":"NVIDIA. Accelerating inference with sparsity using the nvidia ampere architecture and nvidia tensorrt, 2021. Accessed: 2024--10--18."},{"key":"e_1_3_2_1_64_1","volume-title":"ChatGPT: Get instant answers, find creative inspiration, learn something new. https:\/\/openai.com\/chatgpt","author":"AI.","year":"2022","unstructured":"OpenAI. ChatGPT: Get instant answers, find creative inspiration, learn something new. 
https:\/\/openai.com\/chatgpt, 2022."},{"key":"e_1_3_2_1_65_1","first-page":"20378","article-title":"Adaptive sparsity by fine-tuning","volume":"33","author":"Sanh Victor","year":"2020","unstructured":"Victor Sanh, Thomas Wolf, and Alexander Rush. Movement pruning: Adaptive sparsity by fine-tuning. Advances in Neural Information Processing Systems, 33:20378--20389, 2020.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_66_1","volume-title":"Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150","author":"Shazeer Noam","year":"2019","unstructured":"Noam Shazeer. Fast transformer decoding: One write-head is all you need. arXiv preprint arXiv:1911.02150, 2019."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/230514.571645"},{"key":"e_1_3_2_1_68_1","volume-title":"et al. Prosparse: Introducing and enhancing intrinsic activation sparsity within large language models. arXiv preprint arXiv:2402.13516","author":"Song Chenyang","year":"2024","unstructured":"Chenyang Song, Xu Han, Zhengyan Zhang, Shengding Hu, Xiyu Shi, Kuai Li, Chen Chen, Zhiyuan Liu, Guangli Li, Tao Yang, et al. Prosparse: Introducing and enhancing intrinsic activation sparsity within large language models. arXiv preprint arXiv:2402.13516, 2024."},{"key":"e_1_3_2_1_69_1","volume-title":"Achieving sparse activation in small language models. arXiv preprint arXiv:2406.06562","author":"Song Jifeng","year":"2024","unstructured":"Jifeng Song, Kai Huang, Xiangyu Yin, Boyuan Yang, and Wei Gao. Achieving sparse activation in small language models. arXiv preprint arXiv:2406.06562, 2024."},{"key":"e_1_3_2_1_70_1","volume-title":"Powerinfer: Fast large language model serving with a consumer-grade gpu. arXiv preprint arXiv:2312.12456","author":"Song Yixin","year":"2023","unstructured":"Yixin Song, Zeyu Mi, Haotong Xie, and Haibo Chen. Powerinfer: Fast large language model serving with a consumer-grade gpu. 
arXiv preprint arXiv:2312.12456, 2023."},{"key":"e_1_3_2_1_71_1","volume-title":"Turbo sparse: Achieving llm sota performance with minimal activated parameters. arXiv preprint arXiv:2406.05955","author":"Song Yixin","year":"2024","unstructured":"Yixin Song, Haotong Xie, Zhengyan Zhang, Bo Wen, Li Ma, Zeyu Mi, and Haibo Chen. Turbo sparse: Achieving llm sota performance with minimal activated parameters. arXiv preprint arXiv:2406.05955, 2024."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1177\/20965311231168423"},{"key":"e_1_3_2_1_73_1","volume-title":"Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca, 2023."},{"key":"e_1_3_2_1_74_1","volume-title":"Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805","author":"Team Gemini","year":"2023","unstructured":"Gemini Team, Rohan Anil, Sebastian Borgeaud, Yonghui Wu, Jean-Baptiste Alayrac, Jiahui Yu, Radu Soricut, Johan Schalkwyk, Andrew M Dai, Anja Hauth, et al. Gemini: a family of highly capable multimodal models. arXiv preprint arXiv:2312.11805, 2023."},{"key":"e_1_3_2_1_75_1","volume-title":"Gpt-4 technical report","author":"Teams AI","year":"2024","unstructured":"OpenAI Teams. Gpt-4 technical report, 2024."},{"key":"e_1_3_2_1_76_1","volume-title":"Termux app. https:\/\/github.com\/termux\/termux-app","year":"2025","unstructured":"Termux. Termux app. https:\/\/github.com\/termux\/termux-app, 2025. Accessed: 2025-02--25."},{"key":"e_1_3_2_1_77_1","volume-title":"Michael Felsberg, Timothy Baldwin, Eric P. Xing, and Fahad Shahbaz Khan. 
Mobillama: Towards accurate and lightweight fully transparent gpt","author":"Thawakar Omkar","year":"2024","unstructured":"Omkar Thawakar, Ashmal Vayani, Salman Khan, Hisham Cholakkal, Rao Muhammad Anwer, Michael Felsberg, Timothy Baldwin, Eric P. Xing, and Fahad Shahbaz Khan. Mobillama: Towards accurate and lightweight fully transparent gpt, 2024."},{"key":"e_1_3_2_1_78_1","volume-title":"Llama 2: Open foundation and finetuned chat models. arXiv preprint arXiv:2307.09288","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, et al. Llama 2: Open foundation and finetuned chat models. arXiv preprint arXiv:2307.09288, 2023."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/359460.359478"},{"key":"e_1_3_2_1_80_1","volume-title":"Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453","author":"Ma Shuming","year":"2023","unstructured":"HongyuWang, Shuming Ma, Li Dong, Shaohan Huang, Huaijie Wang, Lingxiao Ma, Fan Yang, Ruiping Wang, Yi Wu, and Furu Wei. Bitnet: Scaling 1-bit transformers for large language models. arXiv preprint arXiv:2310.11453, 2023."},{"key":"e_1_3_2_1_81_1","volume-title":"Q-sparse: All large language models can be fully sparsely-activated. arXiv preprint arXiv:2407.10969","author":"Ma Shuming","year":"2024","unstructured":"HongyuWang, Shuming Ma, RuipingWang, and FuruWei. Q-sparse: All large language models can be fully sparsely-activated. arXiv preprint arXiv:2407.10969, 2024."},{"key":"e_1_3_2_1_82_1","volume-title":"Lemo: Enabling less token involvement for more context fine-tuning","author":"Wang Tuowei","year":"2025","unstructured":"Tuowei Wang, Xingyu Chen, Kun Li, Ting Cao, Ju Ren, and Yaoxue Zhang. 
Lemo: Enabling less token involvement for more context fine-tuning, 2025."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41406.2024.00081"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414654"},{"key":"e_1_3_2_1_85_1","volume-title":"Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu. Empowering llm to use smartphone for intelligent task automation. arXiv preprint arXiv:2308.15272","author":"Wen Hao","year":"2023","unstructured":"Hao Wen, Yuanchun Li, Guohong Liu, Shanhui Zhao, Tao Yu, Toby Jia-Jun Li, Shiqi Jiang, Yunhao Liu, Yaqin Zhang, and Yunxin Liu. Empowering llm to use smartphone for intelligent task automation. arXiv preprint arXiv:2308.15272, 2023."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3636534.3649379"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1145\/1362622.1362674"},{"key":"e_1_3_2_1_88_1","volume-title":"Bloomberggpt: A large language model for finance. arXiv preprint arXiv:2303.17564","author":"Wu Shijie","year":"2023","unstructured":"Shijie Wu, Ozan Irsoy, Steven Lu, Vadim Dabravolski, Mark Dredze, Sebastian Gehrmann, Prabhanjan Kambadur, David Rosenberg, and Gideon Mann. Bloomberggpt: A large language model for finance. arXiv preprint arXiv:2303.17564, 2023."},{"key":"e_1_3_2_1_89_1","volume-title":"Flash-llm: Enabling cost-effective and highly-efficient large generative model inference with unstructured sparsity. arXiv preprint arXiv:2309.10285","author":"Xia Haojun","year":"2023","unstructured":"Haojun Xia, Zhen Zheng, Yuchao Li, Donglin Zhuang, Zhongzhu Zhou, Xiafei Qiu, Yong Li, Wei Lin, and Shuaiwen Leon Song. Flash-llm: Enabling cost-effective and highly-efficient large generative model inference with unstructured sparsity. 
arXiv preprint arXiv:2309.10285, 2023."},{"key":"e_1_3_2_1_90_1","first-page":"38087","volume-title":"International Conference on Machine Learning","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. Smoothquant: Accurate and efficient post-training quantization for large language models. In International Conference on Machine Learning, pages 38087--38099. PMLR, 2023."},{"key":"e_1_3_2_1_91_1","volume-title":"Jinxi Zhao, and Zhibin Xiao. Rethinking network pruning--under the pre-train and fine-tune paradigm. arXiv preprint arXiv:2104.08682","author":"Xu Dongkuan","year":"2021","unstructured":"Dongkuan Xu, Ian EH Yen, Jinxi Zhao, and Zhibin Xiao. Rethinking network pruning--under the pre-train and fine-tune paradigm. arXiv preprint arXiv:2104.08682, 2021."},{"key":"e_1_3_2_1_92_1","volume-title":"Onebit: Towards extremely low-bit large language models. arXiv preprint arXiv:2402.11295","author":"Xu Yuzhuang","year":"2024","unstructured":"Yuzhuang Xu, Xu Han, Zonghan Yang, Shuo Wang, Qingfu Zhu, Zhiyuan Liu, Weidong Liu, and Wanxiang Che. Onebit: Towards extremely low-bit large language models. arXiv preprint arXiv:2402.11295, 2024."},{"key":"e_1_3_2_1_93_1","volume-title":"Powerinfer-2: Fast large language model inference on a smartphone. arXiv preprint arXiv:2406.06282","author":"Xue Zhenliang","year":"2024","unstructured":"Zhenliang Xue, Yixin Song, Zeyu Mi, Le Chen, Yubin Xia, and Haibo Chen. Powerinfer-2: Fast large language model inference on a smartphone. arXiv preprint arXiv:2406.06282, 2024."},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613147"},{"key":"e_1_3_2_1_95_1","volume-title":"A survey on large language model (llm) security and privacy: The good, the bad, and the ugly. High-Confidence Computing, page 100211","author":"Yao Yifan","year":"2024","unstructured":"Yifan Yao, Jinhao Duan, Kaidi Xu, Yuanfang Cai, Zhibo Sun, and Yue Zhang. 
A survey on large language model (llm) security and privacy: The good, the bad, and the ugly. High-Confidence Computing, page 100211, 2024."},{"key":"e_1_3_2_1_96_1","volume-title":"et al. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800","author":"Yao Yuan","year":"2024","unstructured":"Yuan Yao, Tianyu Yu, Ao Zhang, Chongyi Wang, Junbo Cui, Hongji Zhu, Tianchi Cai, Haoyu Li, Weilin Zhao, Zhihui He, et al. Minicpm-v: A gpt-4v level mllm on your phone. arXiv preprint arXiv:2408.01800, 2024."},{"key":"e_1_3_2_1_97_1","first-page":"27168","article-title":"Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. Zeroquant: Efficient and affordable post-training quantization for large-scale transformers. Advances in Neural Information Processing Systems, 35:27168--27183, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_98_1","volume-title":"Llm as a system service on mobile devices. arXiv preprint arXiv:2403.11805","author":"Yin Wangsong","year":"2024","unstructured":"Wangsong Yin, Mengwei Xu, Yuanchun Li, and Xuanzhe Liu. Llm as a system service on mobile devices. arXiv preprint arXiv:2403.11805, 2024."},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-9868.2005.00532.x"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1009"},{"key":"e_1_3_2_1_101_1","volume-title":"Hellaswag: Can a machine really finish your sentence? arXiv preprint arXiv:1905.07830","author":"Zellers Rowan","year":"2019","unstructured":"Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. Hellaswag: Can a machine really finish your sentence? arXiv preprint arXiv:1905.07830, 2019."},{"key":"e_1_3_2_1_102_1","volume-title":"Xi Victoria Lin, et al. 
Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068, 2022."},{"key":"e_1_3_2_1_103_1","volume-title":"Moefication: Transformer feed-forward layers are mixtures of experts. arXiv preprint arXiv:2110.01786","author":"Zhang Zhengyan","year":"2021","unstructured":"Zhengyan Zhang, Yankai Lin, Zhiyuan Liu, Peng Li, Maosong Sun, and Jie Zhou. Moefication: Transformer feed-forward layers are mixtures of experts. arXiv preprint arXiv:2110.01786, 2021."},{"key":"e_1_3_2_1_104_1","volume-title":"Relu2 wins: Discovering efficient activation functions for sparse llms. arXiv preprint arXiv:2402.03804","author":"Zhang Zhengyan","year":"2024","unstructured":"Zhengyan Zhang, Yixin Song, Guanghui Yu, Xu Han, Yankai Lin, Chaojun Xiao, Chenyang Song, Zhiyuan Liu, Zeyu Mi, and Maosong Sun. Relu2 wins: Discovering efficient activation functions for sparse llms. arXiv preprint arXiv:2402.03804, 2024."},{"key":"e_1_3_2_1_105_1","volume-title":"Emergent modularity in pre-trained transformers. arXiv preprint arXiv:2305.18390","author":"Zhang Zhengyan","year":"2023","unstructured":"Zhengyan Zhang, Zhiyuan Zeng, Yankai Lin, Chaojun Xiao, Xiaozhi Wang, Xu Han, Zhiyuan Liu, Ruobing Xie, Maosong Sun, and Jie Zhou. Emergent modularity in pre-trained transformers. 
arXiv preprint arXiv:2305.18390, 2023."},{"key":"e_1_3_2_1_106_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613139"},{"key":"e_1_3_2_1_107_1","first-page":"213","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Ningxin","year":"2022","unstructured":"Ningxin Zheng, Bin Lin, Quanlu Zhang, Lingxiao Ma, Yuqing Yang, Fan Yang, Yang Wang, Mao Yang, and Lidong Zhou. SparTA: Deep- Learning Model sparsity via Tensor-with-Sparsity-Attribute. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 213--232, 2022."},{"key":"e_1_3_2_1_108_1","volume-title":"Large language model (llm) for telecommunications: A comprehensive survey on principles, key techniques, and opportunities. arXiv preprint arXiv:2405.10825","author":"Zhou Hao","year":"2024","unstructured":"Hao Zhou, Chengming Hu, Ye Yuan, Yufei Cui, Yili Jin, Can Chen, Haolun Wu, Dun Yuan, Li Jiang, Di Wu, et al. Large language model (llm) for telecommunications: A comprehensive survey on principles, key techniques, and opportunities. 
arXiv preprint arXiv:2405.10825, 2024."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676642.3736114","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T22:23:39Z","timestamp":1757543019000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676642.3736114"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,6]]},"references-count":108,"alternative-id":["10.1145\/3676642.3736114","10.1145\/3676642"],"URL":"https:\/\/doi.org\/10.1145\/3676642.3736114","relation":{},"subject":[],"published":{"date-parts":[[2025,8,6]]},"assertion":[{"value":"2025-08-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}