{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T08:38:55Z","timestamp":1770107935546,"version":"3.49.0"},"reference-count":36,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key R&D Program of China","award":["2024YDLN0004"],"award-info":[{"award-number":["2024YDLN0004"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Cogn. Commun. Netw."],"published-print":{"date-parts":[[2026]]},"DOI":"10.1109\/tccn.2026.3657037","type":"journal-article","created":{"date-parts":[[2026,1,23]],"date-time":"2026-01-23T21:00:32Z","timestamp":1769202032000},"page":"5473-5488","source":"Crossref","is-referenced-by-count":0,"title":["DisHelis: Optimizing Deployment of Disaggregated LLMs Inference Serving Over Heterogeneous Environments via Hierarchical Max-Flow"],"prefix":"10.1109","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-5391-5048","authenticated-orcid":false,"given":"Tao","family":"Zhang","sequence":"first","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7860-4896","authenticated-orcid":false,"given":"Huihuang","family":"Qin","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4026-8338","authenticated-orcid":false,"given":"Dong","family":"Jin","sequence":"additional","affiliation":[{"name":"Institute of Artificial intelligence, Hefei Comprehensive National Science Center, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2817-9738","authenticated-orcid":false,"given":"Shuangwu","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9963-019X","authenticated-orcid":false,"given":"Huasen","family":"He","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7489-2839","authenticated-orcid":false,"given":"Xiaobin","family":"Tan","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]},{"given":"Shiyin","family":"Zhu","sequence":"additional","affiliation":[{"name":"New H3C Technologies Company Ltd., Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7329-4738","authenticated-orcid":false,"given":"Jian","family":"Yang","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, Hefei, Anhui, China"}]}],"member":"263","reference":[{"key":"ref1","article-title":"DeepSeek-V3 technical report","volume-title":"arXiv:2412.19437","author":"Liu","year":"2024"},{"key":"ref2","article-title":"GPT-4 technical report","volume-title":"arXiv:2303.08774","author":"Achiam","year":"2023"},{"key":"ref3","first-page":"1","article-title":"SageAttention: Accurate 8-bit attention for plug-and-play inference acceleration","volume-title":"Proc. Int. Conf. Learn. Represent. (ICLR)","author":"Zhang"},{"key":"ref4","first-page":"1","article-title":"SageAttention2: Efficient attention with thorough outlier smoothing and per-thread INT4 quantization","volume-title":"Proc. Int. Conf. Mach. Learn. (ICML)","author":"Zhang"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TCCN.2025.3528892"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TCCN.2024.3401712"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"ref8","article-title":"Inference without interference: Disaggregate LLM inference for mixed downstream workloads","author":"Hu","year":"2024","journal-title":"arXiv:2401.11181"},{"key":"ref9","first-page":"155","article-title":"Mooncake: Trading more storage for less computation\u2014A KVCache-centric architecture for serving LLM chatbot","volume-title":"Proc. 23rd USENIX Conf. File Storage Technol. (FAST)","author":"Qin"},{"key":"ref10","first-page":"193","article-title":"DistServe: Disaggregating prefill and decoding for goodput-optimized large language model serving","volume-title":"Proc. 18th USENIX Symp. Operating Syst. Design Implement. (OSDI)","author":"Zhong"},{"key":"ref11","article-title":"Megatron-LM: Training multi-billion parameter language models using model parallelism","author":"Shoeybi","year":"2019","journal-title":"arXiv:1909.08053"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2025.3583165"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2024.3391254"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"ref16","first-page":"521","article-title":"Orca: A distributed serving system for transformer-based generative models","volume-title":"Proc. 16th USENIX Symp. Operating Syst. Design Implement.","author":"Yu"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3642970.3655843"},{"key":"ref18","first-page":"1","article-title":"SkyPilot: An intercloud broker for sky computing","volume-title":"Proc. 20th USENIX Symp. Networked Syst. Design Implement. (NSDI)","author":"Yang"},{"key":"ref19","first-page":"1","article-title":"ThunderServe: High-performance and cost-efficient LLM serving in cloud environments","volume-title":"Proc. 8th Conf. Mach. Learn. Syst.","author":"Jiang"},{"key":"ref20","article-title":"M\u00e9lange: Cost efficient large language model serving by exploiting GPU heterogeneity","author":"Griggs","year":"2024","journal-title":"arXiv:2404.14527"},{"key":"ref21","first-page":"1","article-title":"HexGen: Generative inference of large language model over heterogeneous environment","volume-title":"Proc. Forty-first Int. Conf. Mach. Learn.","author":"Jiang"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3627535.3638480"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3669940.3707215"},{"key":"ref24","first-page":"18","article-title":"HexGen-2: Disaggregated generative inference of LLMs in heterogeneous environment","volume-title":"Proc. 13th Int. Conf. Learn. Represent.","author":"Jiang"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672274"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3689031.3696098"},{"key":"ref27","first-page":"1","article-title":"Efficient large language models: A survey","volume":"32","author":"Wan","year":"2023","journal-title":"Trans. Mach. Learn. Res."},{"key":"ref28","first-page":"135","article-title":"ServerlessLLM: Low-latency serverless inference for large language models","volume-title":"Proc. 18th USENIX Symp. Operating Syst. Design Implement. (OSDI 24)","author":"Fu"},{"key":"ref29","first-page":"117","article-title":"Taming throughput-latency tradeoff in LLM inference with sarathi-serve","volume-title":"Proc. 18th USENIX Conf. Operating Syst. Design Implement.","author":"Agrawal"},{"key":"ref30","article-title":"LLM inference unveiled: Survey and roofline model insights","author":"Yuan","year":"2024","journal-title":"arXiv:2402.16363"},{"key":"ref31","volume-title":"Open Compute Framework: Peer-to-Peer Task Queue for Foundation Model Inference Serving","author":"Yao","year":"2023"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1016\/0377-2217(92)90077-M"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1016\/j.ipl.2006.06.003"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1007\/s11590-020-01644-6"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-5060(08)70085-2"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1287\/opre.14.4.699"}],"container-title":["IEEE Transactions on Cognitive Communications and Networking"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6687307\/11304002\/11361142.pdf?arnumber=11361142","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T20:45:50Z","timestamp":1770065150000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11361142\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"references-count":36,"URL":"https:\/\/doi.org\/10.1109\/tccn.2026.3657037","relation":{},"ISSN":["2332-7731","2372-2045"],"issn-type":[{"value":"2332-7731","type":"electronic"},{"value":"2372-2045","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]}}}