{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:45:13Z","timestamp":1772725513399,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62372287,61925206"],"award-info":[{"award-number":["62372287,61925206"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2237295"],"award-info":[{"award-number":["2237295"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707224","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"843-857","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["PipeLLM: Fast and Confidential Large Language Model Services with Speculative Pipelined Encryption"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0008-6329-7274","authenticated-orcid":false,"given":"Yifan","family":"Tan","sequence":"first","affiliation":[{"name":"Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1420-5125","authenticated-orcid":false,"given":"Cheng","family":"Tan","sequence":"additional","affiliation":[{"name":"Northeastern University, Boston, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8395-1319","authenticated-orcid":false,"given":"Zeyu","family":"Mi","sequence":"additional","affiliation":[{"name":"Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9720-0361","authenticated-orcid":false,"given":"Haibo","family":"Chen","sequence":"additional","affiliation":[{"name":"Institute of Parallel and Distributed Systems, SEIEE, Shanghai Jiao Tong University, Shanghai, China"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/www.amd.com\/system\/files\/TechDocs\/Protecting%20VM%20Register%20State%20with%20SEV-ES.pdf","author":"Register State AMD.","year":"2017","unstructured":"AMD. Protecting VM Register State With SEV-ES. https:\/\/www.amd.com\/system\/files\/TechDocs\/Protecting%20VM%20Register%20State%20with%20SEV-ES.pdf, 2017."},{"key":"e_1_3_2_1_2_1","volume-title":"AMD SEV-SNP: Strengthening VM Isolation with Integrity Protection and More. https:\/\/www.amd.com\/system\/files\/TechDocs\/SEV-SNP-strengthening-vm-isolation-with-integrity-protection-and-more.pdf","author":"AMD.","year":"2020","unstructured":"AMD. AMD SEV-SNP: Strengthening VM Isolation with Integrity Protection and More. https:\/\/www.amd.com\/system\/files\/TechDocs\/SEV-SNP-strengthening-vm-isolation-with-integrity-protection-and-more.pdf, 2020."},{"key":"e_1_3_2_1_3_1","volume-title":"ARM Confidential Compute Architecture. https:\/\/www.arm.com\/architecture\/security-features\/arm-confidential-compute-architecture","author":"ARM.","year":"2023","unstructured":"ARM. ARM Confidential Compute Architecture. https:\/\/www.arm.com\/architecture\/security-features\/arm-confidential-compute-architecture, 2023."},{"key":"e_1_3_2_1_4_1","volume-title":"Announcing Azure confidential VMs with NVIDIA H100 Tensor Core GPUs in Preview. https:\/\/aka.ms\/cvm-h100-preview","author":"Azure Microsoft","year":"2023","unstructured":"Microsoft Azure. Announcing Azure confidential VMs with NVIDIA H100 Tensor Core GPUs in Preview. https:\/\/aka.ms\/cvm-h100-preview, 2023."},{"key":"e_1_3_2_1_5_1","volume-title":"Azure confidential computing. https:\/\/azure.microsoft.com\/en-us\/solutions\/confidential-compute\/","author":"Azure Microsoft","year":"2024","unstructured":"Microsoft Azure. Azure confidential computing. https:\/\/azure.microsoft.com\/en-us\/solutions\/confidential-compute\/, 2024."},{"key":"e_1_3_2_1_6_1","first-page":"267","volume-title":"Proceedings of the 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)","author":"Baumann Andrew","year":"2014","unstructured":"Andrew Baumann, Marcus Peinado, and Galen Hunt. Shielding Applications from an Untrusted Cloud with Haven. In Proceedings of the 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14), pages 267--283, Broomfield, CO, October 2014. USENIX Association."},{"key":"e_1_3_2_1_7_1","first-page":"1877","volume-title":"Advances in Neural Information Processing Systems","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. Language models are few-shot learners. In H. Larochelle, M. Ranzato, R. Hadsell, M.F. Balcan, and H. Lin, editors, Advances in Neural Information Processing Systems, volume 33, pages 1877--1901. Curran Associates, Inc., 2020."},{"key":"e_1_3_2_1_8_1","first-page":"209","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Chen Jiahao","year":"2023","unstructured":"Jiahao Chen, Dingji Li, Zeyu Mi, Yuxuan Liu, Binyu Zang, Haibing Guan, and Haibo Chen. Security and performance in the delegated user-level virtualization. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23), pages 209--226, Boston, MA, July 2023. USENIX Association."},{"key":"e_1_3_2_1_9_1","first-page":"1065","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Chen Jiahao","year":"2024","unstructured":"Jiahao Chen, Zeyu Mi, Yubin Xia, Haibing Guan, and Haibo Chen. CPC: Flexible, secure, and efficient CVM maintenance with confidential procedure calls. In 2024 USENIX Annual Technical Conference (USENIX ATC 24), pages 1065--1082, Santa Clara, CA, July 2024. USENIX Association."},{"key":"e_1_3_2_1_10_1","volume-title":"https:\/\/cloud.google.com\/blog\/products\/identity-security\/rsa-snp-vm-more-confidential","author":"Cloud Google","year":"2024","unstructured":"Google Cloud. Oh SNP! VMs get even more confidential. https:\/\/cloud.google.com\/blog\/products\/identity-security\/rsa-snp-vm-more-confidential, 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. Flashattention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691, 2023."},{"key":"e_1_3_2_1_12_1","first-page":"16344","article-title":"Flashattention: Fast and memory-efficient exact attention with ioawareness","volume":"35","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. Flashattention: Fast and memory-efficient exact attention with ioawareness. Advances in Neural Information Processing Systems, 35:16344--16359, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3548606.3560627"},{"key":"e_1_3_2_1_14_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_15_1","volume-title":"Creating the first confidential gpus. Commun. ACM, dec","author":"Dhanuskodi Gobikrishna","year":"2023","unstructured":"Gobikrishna Dhanuskodi, Sudeshna Guha, Vidhya Krishnan, Aruna Manjunatha, Rob Nertney, Michael O'Connor, and Phil Rogers. Creating the first confidential gpus. Commun. ACM, dec 2023."},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Learning Representations","author":"Hu Edward J","year":"2022","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, SheanWang, LuWang, andWeizhu Chen. LoRA: Low-rank adaptation of large language models. In International Conference on Learning Representations, 2022."},{"key":"e_1_3_2_1_17_1","volume-title":"Intel TDX\u00ae Module v1.5 Base Architecture Specification. https:\/\/www.intel.com\/content\/dam\/develop\/external\/us\/en\/documents\/intel-tdx-module-1.5-base-spec-348549001.pdf","year":"2022","unstructured":"Intel. Intel TDX\u00ae Module v1.5 Base Architecture Specification. https:\/\/www.intel.com\/content\/dam\/develop\/external\/us\/en\/documents\/intel-tdx-module-1.5-base-spec-348549001.pdf, 2022."},{"key":"e_1_3_2_1_18_1","volume-title":"Intel\u00ae Trust Domain Extension (Intel\u00ae TDX) Module. https:\/\/www.intel.com\/content\/www\/us\/en\/download\/738875\/intel-trust-domain-extension-intel-tdx-module.html","year":"2022","unstructured":"Intel. Intel\u00ae Trust Domain Extension (Intel\u00ae TDX) Module. https:\/\/www.intel.com\/content\/www\/us\/en\/download\/738875\/intel-trust-domain-extension-intel-tdx-module.html, 2022."},{"key":"e_1_3_2_1_19_1","volume-title":"Intel\u00ae Trust Domain Extensions. https:\/\/www.intel.com\/content\/dam\/develop\/external\/us\/en\/documents\/tdx-whitepaper-v4.pdf","year":"2022","unstructured":"Intel. Intel\u00ae Trust Domain Extensions. https:\/\/www.intel.com\/content\/dam\/develop\/external\/us\/en\/documents\/tdx-whitepaper-v4.pdf, 2022."},{"key":"e_1_3_2_1_20_1","volume-title":"Intel software developer's manual. https:\/\/cdrdv2-public.intel.com\/812392\/325462-sdm-vol-1-2abcd-3abcd-4.pdf","year":"2023","unstructured":"Intel. Intel software developer's manual. https:\/\/cdrdv2-public.intel.com\/812392\/325462-sdm-vol-1-2abcd-3abcd-4.pdf, 2023."},{"key":"e_1_3_2_1_21_1","volume-title":"Intel\u00ae TDX Connect Architecture Specification. https:\/\/cdrdv2.intel.com\/v1\/dl\/getContent\/773614","year":"2023","unstructured":"Intel. Intel\u00ae TDX Connect Architecture Specification. https:\/\/cdrdv2.intel.com\/v1\/dl\/getContent\/773614, 2023."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304021"},{"key":"e_1_3_2_1_23_1","unstructured":"Albert Q. Jiang Alexandre Sablayrolles Antoine Roux Arthur Mensch Blanche Savary Chris Bamford Devendra Singh Chaplot Diego de las Casas Emma Bou Hanna Florian Bressand Gianna Lengyel Guillaume Bour Guillaume Lample L\u00e9lio Renard Lavaud Lucile Saulnier Marie-Anne Lachaux Pierre Stock Sandeep Subramanian Sophia Yang Szymon Antoniak Teven Le Scao Th\u00e9ophile Gervet Thibaut Lavril ThomasWang Timoth\u00e9e Lacroix and William El Sayed. Mixtral of experts 2024."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3624576"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP.2014.9"},{"key":"e_1_3_2_1_27_1","first-page":"1","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li Dingji","year":"2023","unstructured":"Dingji Li, Zeyu Mi, Chenhui Ji, Yifan Tan, Binyu Zang, Haibing Guan, and Haibo Chen. Bifrost: Analysis and optimization of network I\/O tax in confidential virtual machines. In 2023 USENIX Annual Technical Conference (USENIX ATC 23), pages 1--15, Boston, MA, July 2023. USENIX Association."},{"key":"e_1_3_2_1_28_1","first-page":"638","volume-title":"Proceedings of the ACM SIGOPS 28th Symposium on Operating Systems Principles, SOSP '21","author":"Li Dingji","year":"2021","unstructured":"Dingji Li, Zeyu Mi, Yubin Xia, Binyu Zang, Haibo Chen, and Haibing Guan. Twinvisor: Hardware-isolated confidential virtual machines for arm. In Proceedings of the ACM SIGOPS 28th Symposium on Operating Systems Principles, SOSP '21, page 638--654, New York, NY, USA, 2021. Association for Computing Machinery."},{"key":"e_1_3_2_1_29_1","first-page":"155","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Mai HaoHui","year":"2023","unstructured":"HaoHui Mai, Jiacheng Zhao, Hongren Zheng, Yiyang Zhao, Zibin Liu, Mingyu Gao, Cong Wang, Huimin Cui, Xiaobing Feng, and Christos Kozyrakis. Honeycomb: Secure and efficient GPU executions via static validation. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23), pages 155--172, Boston, MA, July 2023. USENIX Association."},{"key":"e_1_3_2_1_30_1","volume-title":"ultrachat-10k-chatml. https:\/\/huggingface.co\/datasets\/smangrul\/ultrachat-10k-chatml","author":"Mangrulkar Sourab","year":"2024","unstructured":"Sourab Mangrulkar. ultrachat-10k-chatml. https:\/\/huggingface.co\/datasets\/smangrul\/ultrachat-10k-chatml, 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"Peft: State-of-the-art parameterefficient fine-tuning methods. https:\/\/github.com\/huggingface\/peft","author":"Mangrulkar Sourab","year":"2022","unstructured":"Sourab Mangrulkar, Sylvain Gugger, Lysandre Debut, Younes Belkada, Sayak Paul, and Benjamin Bossan. Peft: State-of-the-art parameterefficient fine-tuning methods. https:\/\/github.com\/huggingface\/peft, 2022."},{"key":"e_1_3_2_1_32_1","first-page":"1695","volume-title":"29th USENIX Security Symposium (USENIX Security 20)","author":"Mi Zeyu","year":"2020","unstructured":"Zeyu Mi, Dingji Li, Haibo Chen, Binyu Zang, and Haibing Guan. (mostly) exitless VM protection from untrusted hypervisor through disaggregated nested virtualization. In 29th USENIX Security Symposium (USENIX Security 20), pages 1695--1712. USENIX Association, August 2020."},{"key":"e_1_3_2_1_33_1","volume-title":"Microsoft Copilot: Your everyday AI companion. https:\/\/copilot.microsoft.com\/","year":"2024","unstructured":"Microsoft. Microsoft Copilot: Your everyday AI companion. https:\/\/copilot.microsoft.com\/, 2024."},{"key":"e_1_3_2_1_34_1","volume-title":"Benefits of NVIDIA Hopper H100 Confidential Computing for trustworthy AI. https:\/\/developer.nvidia.com\/blog\/confidential-computing-on-h100-gpus-for-secure-and-trustworthy-ai\/","author":"NVIDIA.","year":"2023","unstructured":"NVIDIA. Benefits of NVIDIA Hopper H100 Confidential Computing for trustworthy AI. https:\/\/developer.nvidia.com\/blog\/confidential-computing-on-h100-gpus-for-secure-and-trustworthy-ai\/, 2023."},{"key":"e_1_3_2_1_35_1","volume-title":"Confidential Compute on NVIDIA Hopper H100. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/HCC-Whitepaper-v1.0.pdf","author":"NVIDIA.","year":"2023","unstructured":"NVIDIA. Confidential Compute on NVIDIA Hopper H100. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/HCC-Whitepaper-v1.0.pdf, 2023."},{"key":"e_1_3_2_1_36_1","volume-title":"NVIDIA Confidential Computing. https:\/\/www.nvidia.com\/en-us\/data-center\/solutions\/confidential-computing\/","author":"NVIDIA.","year":"2024","unstructured":"NVIDIA. NVIDIA Confidential Computing. https:\/\/www.nvidia.com\/en-us\/data-center\/solutions\/confidential-computing\/, 2024."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2801153"},{"key":"e_1_3_2_1_38_1","volume-title":"Efficiently scaling transformer inference","author":"Pope Reiner","year":"2022","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Anselm Levskaya, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. Efficiently scaling transformer inference, 2022."},{"key":"e_1_3_2_1_39_1","first-page":"551","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. ZeRO-Offload: Democratizing Billion-Scale model training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21), pages 551--564. USENIX Association, July 2021."},{"key":"e_1_3_2_1_40_1","first-page":"723","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Shen Tianxiang","year":"2022","unstructured":"Tianxiang Shen, Ji Qi, Jianyu Jiang, Xian Wang, Siyuan Wen, Xusheng Chen, Shixiong Zhao, Sen Wang, Li Chen, Xiapu Luo, Fengwei Zhang, and Heming Cui. SOTER: Guarding black-box inference for general neural networks at the edge. In 2022 USENIX Annual Technical Conference (USENIX ATC 22), pages 723--738, Carlsbad, CA, July 2022. USENIX Association."},{"key":"e_1_3_2_1_41_1","first-page":"296","article-title":"Slora: Scalable serving of thousands of lora adapters","volume":"6","author":"Sheng Ying","year":"2024","unstructured":"Ying Sheng, Shiyi Cao, Dacheng Li, Coleman Hooper, Nicholas Lee, Shuo Yang, Christopher Chou, Banghua Zhu, Lianmin Zheng, Kurt Keutzer, et al. Slora: Scalable serving of thousands of lora adapters. Proceedings of Machine Learning and Systems, 6:296--311, 2024.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning, ICML'23. JMLR.org","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. Flexgen: High-throughput generative inference of large language models with a single gpu. In Proceedings of the 40th International Conference on Machine Learning, ICML'23. JMLR.org, 2023."},{"key":"e_1_3_2_1_43_1","volume-title":"Megatron-lm: Training multibillion parameter language models using model parallelism","author":"Shoeybi Mohammad","year":"2020","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multibillion parameter language models using model parallelism, 2020."},{"key":"e_1_3_2_1_44_1","volume-title":"Powerinfer: Fast large language model serving with a consumer-grade gpu. arXiv preprint arXiv:2312.12456","author":"Song Yixin","year":"2023","unstructured":"Yixin Song, Zeyu Mi, Haotong Xie, and Haibo Chen. Powerinfer: Fast large language model serving with a consumer-grade gpu. arXiv preprint arXiv:2312.12456, 2023."},{"key":"e_1_3_2_1_45_1","volume-title":"Turbo sparse: Achieving llm sota performance with minimal activated parameters. arXiv preprint arXiv:2406.05955","author":"Song Yixin","year":"2024","unstructured":"Yixin Song, Haotong Xie, Zhengyan Zhang, Bo Wen, Li Ma, Zeyu Mi, and Haibo Chen. Turbo sparse: Achieving llm sota performance with minimal activated parameters. arXiv preprint arXiv:2406.05955, 2024."},{"key":"e_1_3_2_1_46_1","volume-title":"Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca","author":"Taori Rohan","year":"2023","unstructured":"Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li, Carlos Guestrin, Percy Liang, and Tatsunori B. Hashimoto. Stanford alpaca: An instruction-following llama model. https:\/\/github.com\/tatsu-lab\/stanford_alpaca, 2023."},{"key":"e_1_3_2_1_47_1","volume-title":"https:\/\/sharegpt.com","author":"Team GPT","year":"2023","unstructured":"ShareGPT Team. https:\/\/sharegpt.com, 2023."},{"key":"e_1_3_2_1_48_1","volume-title":"Llama: Open and efficient foundation language models","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. Llama: Open and efficient foundation language models, 2023."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_50_1","first-page":"681","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Volos Stavros","year":"2018","unstructured":"Stavros Volos, Kapil Vaswani, and Rodrigo Bruno. Graviton: Trusted execution environments on GPUs. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pages 681--696, Carlsbad, CA, October 2018. USENIX Association."},{"key":"e_1_3_2_1_51_1","volume-title":"Powerinfer-2: Fast large language model inference on a smartphone. arXiv preprint arXiv:2406.06282","author":"Xue Zhenliang","year":"2024","unstructured":"Zhenliang Xue, Yixin Song, Zeyu Mi, Le Chen, Yubin Xia, and Haibo Chen. Powerinfer-2: Fast large language model inference on a smartphone. arXiv preprint arXiv:2406.06282, 2024."},{"key":"e_1_3_2_1_52_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. Opt: Open pre-trained transformer language models","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. Opt: Open pre-trained transformer language models, 2022."},{"key":"e_1_3_2_1_53_1","volume-title":"Judging llm-as-a-judge with mt-bench and chatbot arena","author":"Zheng Lianmin","year":"2023","unstructured":"Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, ZhanghaoWu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric P. Xing, Hao Zhang, Joseph E. Gonzalez, and Ion Stoica. Judging llm-as-a-judge with mt-bench and chatbot arena, 2023."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP40000.2020.00054"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038228.3038233"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707224","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707224","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:47:15Z","timestamp":1755787635000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707224"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":55,"alternative-id":["10.1145\/3669940.3707224","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707224","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}