{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:40:16Z","timestamp":1755776416085,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":40,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62272026, 62302257"],"award-info":[{"award-number":["62272026, 62302257"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100006374","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2023YFB4503100"],"award-info":[{"award-number":["2023YFB4503100"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3676641.3715986","type":"proceedings-article","created":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T16:47:32Z","timestamp":1743094052000},"page":"178-191","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["CoServe: Efficient Collaboration-of-Experts (CoE) Model Inference with Limited Memory"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5360-353X","authenticated-orcid":false,"given":"Jiashun","family":"Suo","sequence":"first","affiliation":[{"name":"State Key Laboratory of CCSE and School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7924-9268","authenticated-orcid":false,"given":"Xiaojian","family":"Liao","sequence":"additional","affiliation":[{"name":"State Key Laboratory of CCSE and School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9438-9181","authenticated-orcid":false,"given":"Limin","family":"Xiao","sequence":"additional","affiliation":[{"name":"State Key Laboratory of CCSE and School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2386-961X","authenticated-orcid":false,"given":"Li","family":"Ruan","sequence":"additional","affiliation":[{"name":"State Key Laboratory of CCSE and School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6690-8386","authenticated-orcid":false,"given":"Jinquan","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of CCSE and School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5365-2537","authenticated-orcid":false,"given":"Xiao","family":"Su","sequence":"additional","affiliation":[{"name":"State Key Laboratory of CCSE and School of Computer Science and Engineering, Beihang University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5366-0892","authenticated-orcid":false,"given":"Zhisheng","family":"Huo","sequence":"additional","affiliation":[{"name":"State Key Laboratory of CCSE and School of Computer Science and Engineering, Beihang University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2303.08774"},{"key":"e_1_3_2_1_2_1","first-page":"117","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. Taming {Throughput-Latency} tradeoff in {LLM} inference with {Sarathi-Serve}. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pages 117--134, 2024. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/agrawal."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/LES.2023.3298738"},{"issue":"70","key":"e_1_3_2_1_4_1","first-page":"1","article-title":"Scaling instruction-finetuned language models","volume":"25","author":"Chung Hyung Won","year":"2024","unstructured":"Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Yunxuan Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, et al. Scaling instruction-finetuned language models. Journal of Machine Learning Research, 25(70):1--53, 2024. https:\/\/www.jmlr.org\/papers\/v25\/23-0870.html.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_5_1","volume-title":"et al. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. arXiv preprint arXiv:2401.06066","author":"Dai Damai","year":"2024","unstructured":"Damai Dai, Chengqi Deng, Chenggang Zhao, RX Xu, Huazuo Gao, Deli Chen, Jiashi Li,Wangding Zeng, Xingkai Yu, YWu, et al. Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models. arXiv preprint arXiv:2401.06066, 2024. https:\/\/doi.org\/10.48550\/ arXiv.2401.06066."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2020.3034479"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2312.17238"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2310.16795"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508418"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2303.06182"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2407.11686"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00078"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1991.3.1.79"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2401.04088"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2306.02561"},{"key":"e_1_3_2_1_17_1","volume-title":"May","author":"Jocher Glenn","year":"2020","unstructured":"Glenn Jocher. YOLOv5 by Ultralytics, May 2020. https:\/\/github.com\/ultralytics\/yolov5."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2402.07033"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2310.02410"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2024.acl-long.363"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2110.03742"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2001.970573"},{"key":"e_1_3_2_1_24_1","first-page":"945","volume-title":"2023 USENIX Annual Technical Conference (USENIX ATC 23)","author":"Li Jiamin","year":"2023","unstructured":"Jiamin Li, Yimin Jiang, Yibo Zhu, CongWang, and Hong Xu. Accelerating distributed {MoE} training and inference with lina. In 2023 USENIX Annual Technical Conference (USENIX ATC 23), pages 945--959, 2023. https:\/\/www.usenix.org\/conference\/atc23\/presentation\/li-jiamin."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/SIPROCESS.2018.8600456"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2403.19887"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2311.08692"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.knosys.2010.08.004"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO61859.2024.00100"},{"key":"e_1_3_2_1_30_1","volume-title":"Build an ecosystem, not a monolith","author":"Raffel Colin","year":"2023","unstructured":"Colin Raffel. Build an ecosystem, not a monolith, 2023. https:\/\/colinraffel.com\/talks\/simons2023build.pdf."},{"key":"e_1_3_2_1_31_1","first-page":"18332","volume-title":"International conference on machine learning","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. Deepspeed-moe: Advancing mixture-of-experts inference and training to power next-generation ai scale. In International conference on machine learning, pages 18332--18346. PMLR, 2022. https:\/\/proceedings.mlr.press\/v162\/rajbhandari22a.html."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2308.12950"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2404.07413"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICHCI51889.2020.00090"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2403.07816"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2412.04167"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS57955.2024.00086"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2308.14352"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651368"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2406.16554"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"],"location":"Rotterdam Netherlands","acronym":"ASPLOS '25"},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3715986","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3676641.3715986","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T11:13:04Z","timestamp":1755774784000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3676641.3715986"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":40,"alternative-id":["10.1145\/3676641.3715986","10.1145\/3676641"],"URL":"https:\/\/doi.org\/10.1145\/3676641.3715986","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}