{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:05:27Z","timestamp":1775815527468,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T00:00:00Z","timestamp":1745280000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,4,28]]},"DOI":"10.1145\/3696410.3714553","type":"proceedings-article","created":{"date-parts":[[2025,4,22]],"date-time":"2025-04-22T23:08:29Z","timestamp":1745363309000},"page":"4264-4273","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["WeInfer: Unleashing the Power of WebGPU on LLM Inference in Web Browsers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8607-8539","authenticated-orcid":false,"given":"Zhiyang","family":"Chen","sequence":"first","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7866-4075","authenticated-orcid":false,"given":"Yun","family":"Ma","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4599-3198","authenticated-orcid":false,"given":"Haiyang","family":"Shen","sequence":"additional","affiliation":[{"name":"Institute for Artificial Intelligence, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-7625-8721","authenticated-orcid":false,"given":"Mugeng","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Peking University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,4,22]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Mohammad Rastegari, and Mehrdad Farajtabar.","author":"Alizadeh Keivan","year":"2024","unstructured":"Keivan Alizadeh, Iman Mirzadeh, Dmitry Belenko, Karen Khatamifard, Minsik Cho, Carlo C. Del Mundo, Mohammad Rastegari, and Mehrdad Farajtabar. 2024. LLM in a Flash: Efficient Large Language Model Inference with Limited Memory."},{"key":"e_1_3_2_1_2_1","unstructured":"Loubna Ben Allal Anton Lozhkov Elie Bakouch Leandro von Werra and Thomas Wolf. 2024. SmolLM - blazingly fast and remarkably powerful."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_4_1","unstructured":"Apple. 2023. Metal. https:\/\/developer.apple.com\/metal\/"},{"key":"e_1_3_2_1_5_1","volume-title":"The Security Architecture of the Chromium Browser. Technical report","author":"Barth Adam","year":"2008","unstructured":"Adam Barth, Charles Reis, and Collin Jackson. 2008. The Security Architecture of the Chromium Browser. Technical report (2008)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645395"},{"key":"e_1_3_2_1_7_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language Models Are Few-Shot Learners."},{"key":"e_1_3_2_1_8_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 2018). 578--594."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3004509"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1038\/s43856-023-00370-1"},{"key":"e_1_3_2_1_11_1","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et al. 2024. The llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_2_1_12_1","unstructured":"Hugging face. 2023. Transformers.js. https:\/\/huggingface.co\/docs\/transformers.js\/index"},{"key":"e_1_3_2_1_13_1","unstructured":"Google. 2018. Process model of chrome. https:\/\/developer.chrome.com\/blog\/inside-browser-part1"},{"key":"e_1_3_2_1_14_1","unstructured":"Google. 2023a. TensorFlow.js. https:\/\/www.tensorflow.org\/js"},{"key":"e_1_3_2_1_15_1","unstructured":"Google. 2023b. What's New in WebGPU (Chrome 113). https:\/\/developer.chrome.com\/blog\/new-in-webgpu-113"},{"key":"e_1_3_2_1_16_1","unstructured":"Google. 2024a. MediaPipe LLM. https:\/\/ai.google.dev\/edge\/mediapipe\/solutions\/genai\/llm_inference"},{"key":"e_1_3_2_1_17_1","unstructured":"Google. 2024b. Offcial Demo of MediaPipe LLM. https:\/\/mediapipe-studio.webapps.google.com\/demo\/llm_inference"},{"key":"e_1_3_2_1_18_1","unstructured":"Google. 2024c. WebAssembly and WebGPU enhancements for faster Web AI. https:\/\/io.google\/2024\/explore\/4148a1ac-c3a5--43a9--8a3d-f9c2358282e9"},{"key":"e_1_3_2_1_19_1","unstructured":"Google. 2024 d. What's New in WebGPU (Chrome 121). https:\/\/developer.chrome.com\/blog\/new-in-webgpu-121"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/1978248.1978369"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/SP.2008.19"},{"key":"e_1_3_2_1_22_1","unstructured":"W3C Community Group. 2024. WebGPU Explainer. https:\/\/gpuweb.github.io\/gpuweb\/explainer\/#background"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3418297"},{"key":"e_1_3_2_1_24_1","unstructured":"Daya Guo Qihao Zhu Dejian Yang Zhenda Xie Kai Dong Wentao Zhang Guanting Chen Xiao Bi Yu Wu YK Li et al. 2024. DeepSeek-Coder: When the Large Language Model Meets Programming--The Rise of Code Intelligence. arXiv preprint arXiv:2401.14196 (2024)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651380"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3129394"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3643832.3661892"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3604237.3626869"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3610856"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3308558.3313639"},{"key":"e_1_3_2_1_31_1","unstructured":"mdn. 2024. Browser Compatibility of WebGPU. https:\/\/developer.mozilla.org\/en-US\/docs\/Web\/API\/WebGPU_API#browser_compatibility"},{"key":"e_1_3_2_1_32_1","unstructured":"Microsoft. 2023. ONNX Runtime Web. https:\/\/onnxruntime.ai\/docs\/tutorials\/web\/"},{"key":"e_1_3_2_1_33_1","unstructured":"mlc ai. 2023. WebLLM. https:\/\/webllm.mlc.ai"},{"key":"e_1_3_2_1_34_1","unstructured":"mlc ai. 2024. WebLLM Chat. https:\/\/chat.webllm.ai\/"},{"key":"e_1_3_2_1_35_1","unstructured":"Mozilla. 2019. Process model of firefox. https:\/\/wiki.mozilla.org\/Security\/Sandbox\/Process_model"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378534"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/IS.2018.8710545"},{"key":"e_1_3_2_1_39_1","volume-title":"Proceedings of Machine Learning and Systems","volume":"5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently Scaling Transformer Inference. In Proceedings of Machine Learning and Systems (2023), Vol. 5. 606--624."},{"key":"e_1_3_2_1_40_1","volume-title":"Unlocking the Potential of Health Data with Decentralised Search in Personal Health Datastores. In Companion Proceedings of the ACM Web Conference 2024 (WWW 24)","author":"Ragab Mohamed","year":"2024","unstructured":"Mohamed Ragab, Yury Savateev, Helen Oliver, Thanassis Tiropanis, Alexandra Poulovassilis, Adriane Chapman, and George Roussos. 2024. Unlocking the Potential of Health Data with Decentralised Search in Personal Health Datastores. In Companion Proceedings of the ACM Web Conference 2024 (WWW 24). 1154--1157."},{"key":"e_1_3_2_1_41_1","unstructured":"Charlie F Ruan Yucheng Qin Xun Zhou Ruihang Lai Hongyi Jin Yixin Dong Bohan Hou Meng-Shiun Yu Yiyan Zhai Sudeep Agarwal et al. 2024. WebLLM: A High-Performance In-Browser LLM Inference Engine. arXiv preprint arXiv:2412.15803 (2024)."},{"key":"e_1_3_2_1_42_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning (ICML","volume":"202","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher Re, Ion Stoica, and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. In Proceedings of the 40th International Conference on Machine Learning (ICML 2023), Vol. 202. 31094--31116."},{"key":"e_1_3_2_1_43_1","unstructured":"Ruslan Svirschevski Avner May Zhuoming Chen Beidi Chen Zhihao Jia and Max Ryabinin. 2024. SpecExec: Massively Parallel Speculative Decoding for Interactive LLM Inference on Consumer Devices."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11704-024-40066-w"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538763"},{"key":"e_1_3_2_1_46_1","unstructured":"European Union. 2021. General data protection regulation. https:\/\/gdpr-info.eu\/"},{"key":"e_1_3_2_1_47_1","volume-title":"SocialGenPod: Privacy-Friendly Generative AI Social Web Applications with Decentralised Personal Data Stores. In Companion Proceedings of the ACM on Web Conference 2024 (WWW","author":"Vizgirda Vidminas","year":"2024","unstructured":"Vidminas Vizgirda, Rui Zhao, and Naman Goel. 2024. SocialGenPod: Privacy-Friendly Generative AI Social Web Applications with Decentralised Personal Data Stores. In Companion Proceedings of the ACM on Web Conference 2024 (WWW 2024). 1067--1070."},{"key":"e_1_3_2_1_48_1","unstructured":"W3C. 2023. WebGPU. https:\/\/www.w3.org\/TR\/webgpu\/"},{"key":"e_1_3_2_1_49_1","unstructured":"W3C. 2024. Timeline model of WebGPU. https:\/\/gpuweb.github.io\/gpuweb\/#programming-model-timelines"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587438"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/OJCS.2023.3300321"},{"key":"e_1_3_2_1_52_1","unstructured":"Daliang Xu Hao Zhang Liming Yang Ruiqi Liu Gang Huang Mengwei Xu and Xuanzhe Liu. 2024. Empowering 1000 Tokens\/Second on-Device LLM Prefilling with Mllm-NPU."},{"key":"e_1_3_2_1_53_1","unstructured":"Zhenliang Xue Yixin Song Zeyu Mi Le Chen Yubin Xia and Haibo Chen. 2024. PowerInfer-2: Fast Large Language Model Inference on a Smartphone."},{"key":"e_1_3_2_1_54_1","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang et al. 2024. Qwen2 Technical Report."},{"key":"e_1_3_2_1_55_1","first-page":"27168","article-title":"ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers","volume":"35","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. 2022. ZeroQuant: Efficient and Affordable Post-Training Quantization for Large-Scale Transformers. In Advances in Neural Information Processing Systems, Vol. 35. 27168--27183.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_56_1","volume-title":"Galaxy: A Resource-Efficient Collaborative Edge AI System for In-situ Transformer Inference. arxiv: 2405.17245","author":"Ye Shengyuan","year":"2024","unstructured":"Shengyuan Ye, Jiangsu Du, Liekang Zeng, Wenzhong Ou, Xiaowen Chu, Yutong Lu, and Xu Chen. 2024. Galaxy: A Resource-Efficient Collaborative Edge AI System for In-situ Transformer Inference. arxiv: 2405.17245"},{"key":"e_1_3_2_1_57_1","unstructured":"Rongjie Yi Liwei Guo Shiyun Wei Ao Zhou Shangguang Wang and Mengwei Xu. 2023. EdgeMoE: Fast On-Device Inference of MoE-Based Large Language Models."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872518.2889366"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-demos.16"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"}],"event":{"name":"WWW '25: The ACM Web Conference 2025","location":"Sydney NSW Australia","acronym":"WWW '25","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM on Web Conference 2025"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714553","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3696410.3714553","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:33Z","timestamp":1750295913000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3696410.3714553"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,22]]},"references-count":59,"alternative-id":["10.1145\/3696410.3714553","10.1145\/3696410"],"URL":"https:\/\/doi.org\/10.1145\/3696410.3714553","relation":{},"subject":[],"published":{"date-parts":[[2025,4,22]]},"assertion":[{"value":"2025-04-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}