{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,18]],"date-time":"2026-05-18T09:37:53Z","timestamp":1779097073984,"version":"3.51.4"},"reference-count":16,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,4,21]],"date-time":"2024-04-21T00:00:00Z","timestamp":1713657600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,4,21]],"date-time":"2024-04-21T00:00:00Z","timestamp":1713657600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62171064"],"award-info":[{"award-number":["62171064"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,4,21]]},"DOI":"10.1109\/wcnc57260.2024.10571127","type":"proceedings-article","created":{"date-parts":[[2024,7,3]],"date-time":"2024-07-03T17:25:49Z","timestamp":1720027549000},"page":"1-6","source":"Crossref","is-referenced-by-count":11,"title":["Edge Intelligence Optimization for Large Language Model Inference with Batching and Quantization"],"prefix":"10.1109","author":[{"given":"Xinyuan","family":"Zhang","sequence":"first","affiliation":[{"name":"BUPT,State Key Laboratory of Networking and Switching Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jiang","family":"Liu","sequence":"additional","affiliation":[{"name":"BUPT,State Key Laboratory of Networking and Switching Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zehui","family":"Xiong","sequence":"additional","affiliation":[{"name":"Information Systems Technology and Design Pillar, SUTD,Singapore"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yudong","family":"Huang","sequence":"additional","affiliation":[{"name":"BUPT,State Key Laboratory of Networking and Switching Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gaochang","family":"Xie","sequence":"additional","affiliation":[{"name":"BUPT,State Key Laboratory of Networking and Switching Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ran","family":"Zhang","sequence":"additional","affiliation":[{"name":"BUPT,State Key Laboratory of Networking and Switching Technology,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3593013.3594067"},{"key":"ref2","article-title":"A survey of large language models","author":"Zhao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/comst.2024.3353265"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2019.2918951"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2021.3125949"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2022.3172402"},{"key":"ref7","volume-title":"GPT-4 architecture, datasets, costs and more leaked","author":"Schreiner"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TWC.2019.2946140"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref10","article-title":"A comprehensive study on post-training quantization for large language models","author":"Yao","year":"2023","journal-title":"arXiv preprint"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2023.3242724"},{"key":"ref12","volume-title":"[Online]"},{"key":"ref13","first-page":"521","article-title":"Orca: A distributed serving system for transformer-based generative models","volume-title":"OSDI 22","author":"Yu","year":"2022"},{"key":"ref14","article-title":"High-throughput generative inference of large language models with a single GPU","author":"Sheng","year":"2023","journal-title":"arXiv preprint"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.cor.2021.105693"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TMC.2021.3120520"}],"event":{"name":"2024 IEEE Wireless Communications and Networking Conference (WCNC)","location":"Dubai, United Arab Emirates","start":{"date-parts":[[2024,4,21]]},"end":{"date-parts":[[2024,4,24]]}},"container-title":["2024 IEEE Wireless Communications and Networking Conference (WCNC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10570463\/10570502\/10571127.pdf?arnumber=10571127","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,6]],"date-time":"2024-07-06T05:09:43Z","timestamp":1720242583000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10571127\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,21]]},"references-count":16,"URL":"https:\/\/doi.org\/10.1109\/wcnc57260.2024.10571127","relation":{},"subject":[],"published":{"date-parts":[[2024,4,21]]}}}