{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,17]],"date-time":"2025-09-17T16:42:02Z","timestamp":1758127322010,"version":"3.28.0"},"reference-count":50,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,6,19]],"date-time":"2024-06-19T00:00:00Z","timestamp":1718755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,6,19]],"date-time":"2024-06-19T00:00:00Z","timestamp":1718755200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,6,19]]},"DOI":"10.1109\/iwqos61813.2024.10682949","type":"proceedings-article","created":{"date-parts":[[2024,9,26]],"date-time":"2024-09-26T17:41:00Z","timestamp":1727372460000},"page":"1-10","source":"Crossref","is-referenced-by-count":1,"title":["SyncIntellects: Orchestrating LLM Inference with Progressive Prediction and QoS-Friendly Control"],"prefix":"10.1109","author":[{"given":"Xue","family":"Lin","sequence":"first","affiliation":[{"name":"Nankai University,Tianjin,China"}]},{"given":"Zhibo","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nankai University,Tianjin,China"}]},{"given":"Peining","family":"Yue","sequence":"additional","affiliation":[{"name":"State Grid Tianjin Electronic Power Company,Tianjin,China"}]},{"given":"Haoran","family":"Li","sequence":"additional","affiliation":[{"name":"Nankai University,Tianjin,China"}]},{"given":"Jin","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nankai University,Tianjin,China"}]},{"given":"Baoyu","family":"Fan","sequence":"additional","affiliation":[{"name":"Nankai University,Tianjin,China"}]},{"given":"Huayou","family":"Su","sequence":"additional","affiliation":[{"name":"National University of Defense Technology,Hunan,China"}]},{"given":"Xiaoli","family":"Gong","sequence":"additional","affiliation":[{"name":"Nankai University,Tianjin,China"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3406095"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.eacl-main.24"},{"article-title":"A deep reinforced model for abstractive summarization","volume-title":"International Conference on Learning Representations(ICLR)","author":"Paulus","key":"ref3"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-industry.74"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref7","first-page":"5731","article-title":"Boosting the throughput and accelerator utilization of specialized cnn inference beyond increasing batch size","volume-title":"International Conference on Machine Learning (ICML)","author":"Kosaian"},{"article-title":"Response length perception and sequence scheduling: An llm-empowered llm inference pipeline","year":"2023","author":"Zheng","key":"ref8"},{"article-title":"S3: Increasing gpu utilization during generative inference for higher throughput","year":"2023","author":"Jin","key":"ref9"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3409256.3409823"},{"issue":"1","key":"ref11","first-page":"5485","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"The Journal of Machine Learning Research"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3560815"},{"article-title":"Apt: Adaptive pruning and tuning pretrained language models for efficient training and inference","volume-title":"Proc. ACM Interact. Mob. Wearable Ubiquitous Technol (IMWUT)","author":"Zhao","key":"ref13"},{"key":"ref14","first-page":"31094","article-title":"Flexgen: High-throughput generative inference of large language models with a single gpu","volume-title":"International Conference on Machine Learning (ICML)","author":"Sheng"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-4009"},{"article-title":"Fast transformer decoding: One write-head is all you need","year":"2019","author":"Shazeer","key":"ref16"},{"article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","year":"2019","author":"Shoeybi","key":"ref17"},{"article-title":"Tensorflow-serving: Flexible, high-performance ml serving","volume-title":"Workshop on ML Systems at NIPS 2017","author":"Olston","key":"ref18"},{"key":"ref19","article-title":"Pytorch: An imperative style, high-performance deep learning library","volume":"32","author":"Paszke","year":"2019","journal-title":"Advances in neural information processing systems (NIPS)"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/S0377-2217(99)00153-8"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1016\/j.ejor.2021.06.012"},{"key":"ref23","first-page":"521","article-title":"Orca: A distributed serving system for transformer-based generative models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI)","author":"Yu"},{"issue":"1","key":"ref24","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1186\/s42774-020-00055-6","article-title":"Predicting running time of aerodynamic jobs in hpc system by combining supervised and unsupervised learning method","volume":"3","author":"Wang","year":"2021","journal-title":"Advances in Aerodynamics"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1186\/s40537-022-00623-1"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/2612669.2612675"},{"key":"ref27","first-page":"389","article-title":"Predicting and using target length in neural machine translation","volume-title":"the 1st Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 10th International Joint Conference on Natural Language Processing","author":"Yang"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.319"},{"article-title":"Chat-rec: Towards interactive and explainable llms-augmented recommender system","year":"2023","author":"Gao","key":"ref29"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00276"},{"article-title":"Llama 2: Open foundation and fine-tuned chat models","year":"2023","author":"Touvron","key":"ref31"},{"key":"ref32","article-title":"Job scheduling for multi-user mapreduce clusters","volume-title":"Technical Report UCB\/EECS-2009-55","author":"Zaharia","year":"2009"},{"article-title":"A prompt pattern catalog to enhance prompt engineering with chatgpt","year":"2023","author":"White","key":"ref33"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.3390\/aerospace10030279"},{"article-title":"Flashdecoding++: Faster large language model inference on gpus","year":"2023","author":"Hong","key":"ref35"},{"key":"ref36","article-title":"Best practices for prompt engineering with openai api","volume-title":"OpenAI","author":"Shieh","year":"2023"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00988"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P19-1285"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2023.127063"},{"volume-title":"Llama2","year":"2023","author":"Hugo","key":"ref40"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P17-1147"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-2124"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1259"},{"article-title":"Measuring massive multitask language understanding","volume-title":"International Conference on Learning Representations","author":"Hendrycks","key":"ref44"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1082"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/3474381"},{"article-title":"Bertscore: Evaluating text generation with bert","volume-title":"International Conference on Learning Representations","author":"Zhang","key":"ref47"},{"key":"ref48","first-page":"60","article-title":"Chateval: A tool for chatbot evaluation","volume-title":"the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)","author":"Sedoc"},{"key":"ref49","first-page":"24824","article-title":"Chain-of-thought prompting elicits reasoning in large language models","volume":"35","author":"Wei","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00205"}],"event":{"name":"2024 IEEE\/ACM 32nd International Symposium on Quality of Service (IWQoS)","start":{"date-parts":[[2024,6,19]]},"location":"Guangzhou, China","end":{"date-parts":[[2024,6,21]]}},"container-title":["2024 IEEE\/ACM 32nd International Symposium on Quality of Service (IWQoS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10682818\/10682608\/10682949.pdf?arnumber=10682949","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,9,27]],"date-time":"2024-09-27T04:41:16Z","timestamp":1727412076000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10682949\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,19]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/iwqos61813.2024.10682949","relation":{},"subject":[],"published":{"date-parts":[[2024,6,19]]}}}