{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,14]],"date-time":"2026-02-14T10:23:11Z","timestamp":1771064591108,"version":"3.50.1"},"reference-count":39,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"6","license":[{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,12,1]],"date-time":"2024-12-01T00:00:00Z","timestamp":1733011200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Shanghai Key Laboratory of Scalable Computing and Systems"},{"name":"Eighth Research Institute of China Aerospace Science and Technology Group Company, Ltd.","award":["USCAST2023-17"],"award-info":[{"award-number":["USCAST2023-17"]}]},{"name":"Eighth Research Institute of China Aerospace Science and Technology Group Company, Ltd.","award":["USCAST2023-21"],"award-info":[{"award-number":["USCAST2023-21"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Comput. Soc. Syst."],"published-print":{"date-parts":[[2024,12]]},"DOI":"10.1109\/tcss.2024.3423749","type":"journal-article","created":{"date-parts":[[2024,8,7]],"date-time":"2024-08-07T17:54:46Z","timestamp":1723053286000},"page":"7941-7951","source":"Crossref","is-referenced-by-count":6,"title":["SLoB: Suboptimal Load Balancing Scheduling in Local Heterogeneous GPU Clusters for Large Language Model Inference"],"prefix":"10.1109","volume":"11","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-3787-5681","authenticated-orcid":false,"given":"Peiwen","family":"Jiang","sequence":"first","affiliation":[{"name":"School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-9360-6084","authenticated-orcid":false,"given":"Haoxin","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9373-8474","authenticated-orcid":false,"given":"Zinuo","family":"Cai","sequence":"additional","affiliation":[{"name":"School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-1579-2778","authenticated-orcid":false,"given":"Lintao","family":"Gao","sequence":"additional","affiliation":[{"name":"Shanghai Aerospace System Engineering Institute, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9800-1068","authenticated-orcid":false,"given":"Weishan","family":"Zhang","sequence":"additional","affiliation":[{"name":"Qingdao Institute of Software, College of Computer Science and Technology, China University of Petroleum (East China), Qingdao, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9592-8490","authenticated-orcid":false,"given":"Ruhui","family":"Ma","sequence":"additional","affiliation":[{"name":"School of Electronic Information and Electrical Engineering, Shanghai Jiao Tong University, Shanghai, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3488-4679","authenticated-orcid":false,"given":"Xiaokang","family":"Zhou","sequence":"additional","affiliation":[{"name":"Business Data Science, Kansai University, Osaka, Japan"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Proc. Adv. Neural Inf. Process. Syst."},{"key":"ref2","article-title":"Scaling laws for neural language models","author":"Kaplan","year":"2020"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2011.2158001"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2007.41"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/MIS.2010.104"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/JIOT.2020.3032544"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2825538"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1016\/j.compind.2017.09.001"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.5040\/9798400614842"},{"key":"ref10","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref11","article-title":"LoRA: Low-rank adaptation of large language models","author":"Hu","year":"2021"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.acl-short.8"},{"key":"ref13","first-page":"499","article-title":"PipeSwitch: Fast pipelined context switching for deep learning applications","author":"Bai","year":"2020","journal-title":"Proc. 14th USENIX Symp. Operating Syst. Des. Implementation"},{"key":"ref14","article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019"},{"key":"ref15","article-title":"Fast transformer decoding: One write-head is all you need","author":"Shazeer","year":"2019"},{"key":"ref16","first-page":"443","article-title":"Serving DNNs like clockwork: Performance predictability from the bottom up","author":"Gujarati","year":"2020","journal-title":"Proc. 14th USENIX Symp. Operating Syst. Des. Implementation"},{"key":"ref17","first-page":"787","article-title":"SHEPHERD: Serving $\\{$DNNs$\\}$ in the wild","author":"Zhang","year":"2023","journal-title":"Proc. 20th USENIX Symp. Netw. Syst. Des. Implementation"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3460352"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3587459"},{"key":"ref20","article-title":"A survey of large language models","author":"Zhao","year":"2023"},{"key":"ref21","article-title":"Fast distributed inference serving for large language models","author":"Wu","year":"2023"},{"key":"ref22","first-page":"521","article-title":"Orca: A distributed serving system for transformer-based generative models","author":"Yu","year":"2022","journal-title":"Proc. 16th USENIX Symp. Operating Syst. Des. Implementation"},{"key":"ref23","first-page":"65517","article-title":"Response length perception and sequence scheduling: An LLM-empowered llm inference pipeline","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","volume":"36","author":"Zheng","year":"2024"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/mvt.2023.3323757"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3603269.3610856"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-48986-4_311552"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2023.3280970"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3552326.3567508"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613175"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"ref31","article-title":"TensorRT inference with tensorflow","year":"2024"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3010089.3010131"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2022.3227023"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-018-2435-1"},{"key":"ref35","first-page":"473","article-title":"Tetris: Memory-efficient serverless inference through tensor sharing","volume-title":"Proc. USENIX Annu. Tech. Conf.","author":"Li","year":"2022"},{"key":"ref36","first-page":"69","article-title":"Transparent $\\{$GPU$\\}$ sharing in container clouds for deep learning workloads","author":"Wu","year":"2023","journal-title":"Proc. 20th USENIX Symp. Netw. Syst. Des. Implementation"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2023.3345401"},{"key":"ref38","article-title":"FaaSwap: Slo-aware, GPU-efficient serverless inference via model swapping","author":"Yu","year":"2023"},{"key":"ref39","article-title":"ServerlessLLM: Locality-enhanced serverless inference for large language models","author":"Fu","year":"2024"}],"container-title":["IEEE Transactions on Computational Social Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6570650\/10772353\/10630536.pdf?arnumber=10630536","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,3]],"date-time":"2024-12-03T19:10:38Z","timestamp":1733253038000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10630536\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,12]]},"references-count":39,"journal-issue":{"issue":"6"},"URL":"https:\/\/doi.org\/10.1109\/tcss.2024.3423749","relation":{},"ISSN":["2329-924X","2373-7476"],"issn-type":[{"value":"2329-924X","type":"electronic"},{"value":"2373-7476","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024,12]]}}}