{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T02:15:44Z","timestamp":1767320144751,"version":"3.48.0"},"publisher-location":"Singapore","reference-count":16,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819550111","type":"print"},{"value":"9789819550128","type":"electronic"}],"license":[{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T00:00:00Z","timestamp":1767225600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-5012-8_34","type":"book-chapter","created":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T02:13:14Z","timestamp":1767319994000},"page":"457-465","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["EdgeInfer-TP: A Collaborative Tensor Parallelism Inference System for\u00a0Heterogeneous Edge Devices"],"prefix":"10.1007","author":[{"given":"Yutao","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Wentao","family":"Zhong","sequence":"additional","affiliation":[]},{"given":"Xuerui","family":"Liu","sequence":"additional","affiliation":[]},{"given":"Fengyi","family":"Huang","sequence":"additional","affiliation":[]},{"given":"Wenhua","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Tian","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Weijia","family":"Jia","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2026,1,2]]},"reference":[{"key":"34_CR1","unstructured":"Lee, W., Lee, J., Seo, J., Sim, J.: InfiniGen: efficient generative inference of large language models with dynamic KV cache management. In: 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), pp. 155\u2013172 (2024)"},{"issue":"2","key":"34_CR2","doi-asserted-by":"publisher","first-page":"595","DOI":"10.1109\/TNET.2020.3042320","volume":"29","author":"X Liekang Zeng","year":"2020","unstructured":"Liekang Zeng, X., Chen, Z.Z., Yang, L., Zhang, J.: Coedge: cooperative DNN inference with adaptive workload partitioning over heterogeneous edge devices. IEEE\/ACM Trans. Netw. 29(2), 595\u2013608 (2020)","journal-title":"IEEE\/ACM Trans. Netw."},{"key":"34_CR3","doi-asserted-by":"crossref","unstructured":"Alqahtani, D.K., Cheema, M.A., Toosi, A.N.: Benchmarking deep learning models for object detection on edge computing devices. In: International Conference on Service-Oriented Computing, pp. 142\u2013150. Springer (2024)","DOI":"10.1007\/978-981-96-0805-8_11"},{"issue":"2","key":"34_CR4","doi-asserted-by":"publisher","first-page":"743","DOI":"10.1109\/TSC.2025.3539201","volume":"18","author":"F Huang","year":"2025","unstructured":"Huang, F., et al.: DRMQ: dynamic resource management for enhanced QoS in collaborative edge-edge industrial environments. IEEE Trans. Serv. Comput. 18(2), 743\u2013757 (2025)","journal-title":"IEEE Trans. Serv. Comput."},{"issue":"5","key":"34_CR5","doi-asserted-by":"publisher","first-page":"2174","DOI":"10.1109\/TBDATA.2024.3404104","volume":"11","author":"W Wang","year":"2025","unstructured":"Wang, W., Yang, Q., Liang, Y., Yang, X., Liu, Q., Wang, T.: Heterogeneous device collaboration based federated learning for big data applications. IEEE Trans. Big Data 11(5), 2174\u20132183 (2025)","journal-title":"IEEE Trans. Big Data"},{"key":"34_CR6","doi-asserted-by":"publisher","first-page":"7957","DOI":"10.52202\/079017-0256","volume":"37","author":"RB Prabhakar","year":"2024","unstructured":"Prabhakar, R.B., Zhang, H., Wentzlaff, D.: Inherently parallel transformers for efficient multi-device inference Kraken. Adv. Neural. Inf. Process. Syst. 37, 7957\u20137980 (2024)","journal-title":"Adv. Neural. Inf. Process. Syst."},{"key":"34_CR7","first-page":"162","volume":"6","author":"X Zhao","year":"2024","unstructured":"Zhao, X., Jia, B., Zhou, H., Liu, Z., Cheng, S., You, Y.: Hetegen: efficient heterogeneous parallel inference for large language models on resource-constrained devices. Proc. Mach. Learn. Syst. 6, 162\u2013172 (2024)","journal-title":"Proc. Mach. Learn. Syst."},{"key":"34_CR8","unstructured":"Shi, L., Zhang, H., Yao, Y., Li, Z., Zhao, H.: Keep the cost down: a review on methods to optimize LLM\u2019s KV-cache consumption. arXiv preprint arXiv:2407.18003 (2024)"},{"key":"34_CR9","unstructured":"Sheng, Y., et al.: Flexgen: high-throughput generative inference of large language models with a single GPU. In: International Conference on Machine Learning, pp. 31094\u201331116. PMLR (2023)"},{"key":"34_CR10","unstructured":"Ge, S., Zhang, Y., Liu, L., Zhang, M., Han, J., Gao, J.: Model tells you what to discard: adaptive KV cache compression for LLMS. arXiv preprint arXiv:2310.01801 (2023)"},{"key":"34_CR11","unstructured":"Kaddour, J., Harris, J., Mozes, M., Bradley, H., Raileanu, R., McHardy, R.: Challenges and applications of large language models. arXiv preprint arXiv:2307.10169 (2023)"},{"key":"34_CR12","doi-asserted-by":"crossref","unstructured":"Chen, C., et al.: Centauri: enabling efficient scheduling for communication-computation overlap in large model training via communication partitioning. In: Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, vol. 3, pp. 178\u2013191 (2024)","DOI":"10.1145\/3620666.3651379"},{"key":"34_CR13","unstructured":"Tadych, B.: Distributed llama (2024). https:\/\/github.com\/b4rtaz\/distributed-llama"},{"key":"34_CR14","unstructured":"Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., Catanzaro, B.: Megatron-LM: training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)"},{"key":"34_CR15","unstructured":"D\u00a0Team, Majumder, R., Proskurin, A.: Deepspeed: accelerating large-scale model inference and training via system optimizations and compression. Microsoft Research Blog (2021)"},{"key":"34_CR16","doi-asserted-by":"crossref","unstructured":"Li, Z., Feng, W., Guizani, M., Yu, H.: TPI-LLM: serving 70b-scale LLMS efficiently on low-resource edge devices. arXiv preprint arXiv:2410.00531 (2024)","DOI":"10.1109\/TSC.2025.3596892"}],"container-title":["Lecture Notes in Computer Science","Service-Oriented Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-5012-8_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T02:13:15Z","timestamp":1767319995000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-5012-8_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026]]},"ISBN":["9789819550111","9789819550128"],"references-count":16,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-5012-8_34","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026]]},"assertion":[{"value":"2 January 2026","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ICSOC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Service-Oriented Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Shenzhen","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 December 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 December 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"icsoc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/icsoc2025.hit.edu.cn\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}