{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:33:12Z","timestamp":1767339192937,"version":"3.44.0"},"reference-count":39,"publisher":"IEEE","license":[{"start":{"date-parts":[[2024,5,20]],"date-time":"2024-05-20T00:00:00Z","timestamp":1716163200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2024,5,20]],"date-time":"2024-05-20T00:00:00Z","timestamp":1716163200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100006190","name":"Research and Development","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006190","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006190","name":"Research and Development","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100006190","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100003012","name":"Impact Fund","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100003012","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024,5,20]]},"DOI":"10.1109\/infocom52122.2024.10621087","type":"proceedings-article","created":{"date-parts":[[2024,8,12]],"date-time":"2024-08-12T13:25:41Z","timestamp":1723469141000},"page":"1021-1030","source":"Crossref","is-referenced-by-count":2,"title":["OTAS: An Elastic Transformer Serving System via Token Adaptation"],"prefix":"10.1109","author":[{"given":"Jinyu","family":"Chen","sequence":"first","affiliation":[{"name":"The Hong Kong Polytechnic University,Department of Computing"}]},{"given":"Wenchao","family":"Xu","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University,Department of Computing"}]},{"given":"Zicong","family":"Hong","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University,Department of Computing"}]},{"given":"Song","family":"Guo","sequence":"additional","affiliation":[{"name":"The Hong Kong University of Science and Technology,Department of Computer Science and Engineering"}]},{"given":"Haozhao","family":"Wang","sequence":"additional","affiliation":[{"name":"Huazhong University of Science and Technology,School of Computer Science and Technology"}]},{"given":"Jie","family":"Zhang","sequence":"additional","affiliation":[{"name":"The Hong Kong Polytechnic University,Department of Computing"}]},{"given":"Deze","family":"Zeng","sequence":"additional","affiliation":[{"name":"China University of Geosciences,School of Computer Science"}]}],"member":"263","reference":[{"volume-title":"Surpassing nvidia fastertransformer\u2019s inference performance by 50%, open source project powers into the future of large models industrialization. HPC-AI Tech","year":"2022","key":"ref1"},{"volume-title":"Github copilot: Your ai pair programmer. Github","year":"2023","key":"ref2"},{"volume-title":"Introducing chatgpt. OpenAI","year":"2022","key":"ref3"},{"volume-title":"Chatgpt\u2019s growth begins to flatten, up 12.6% from march to april. Similarweb","year":"2023","key":"ref4"},{"doi-asserted-by":"publisher","key":"ref5","DOI":"10.14778\/3570690.3570692"},{"doi-asserted-by":"publisher","key":"ref6","DOI":"10.1145\/3552326.3587438"},{"doi-asserted-by":"publisher","key":"ref7","DOI":"10.1145\/3575693.3575698"},{"key":"ref8","first-page":"397","article-title":"INFaaS: Automated Model-less Inference Serving","volume-title":"USENIX Annual Technical Conference","author":"Romero"},{"year":"2023","author":"Sheng","article-title":"High-throughput generative inference of large language models with a single gpu","key":"ref9"},{"volume-title":"Microsoft","year":"2023","article-title":"What are tokens?","key":"ref10"},{"doi-asserted-by":"publisher","key":"ref11","DOI":"10.48550\/ARXIV.1706.03762"},{"volume-title":"International Conference on Learning Representations","author":"Bolya","article-title":"Token Merging: Your ViT but Faster","key":"ref12"},{"doi-asserted-by":"publisher","key":"ref13","DOI":"10.1007\/978-3-031-19827-4_41"},{"year":"2020","author":"Dosovitskiy","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","key":"ref14"},{"year":"2018","author":"Devlin","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","key":"ref15"},{"issue":"1","key":"ref16","first-page":"5232","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"The Journal of Machine Learning Research"},{"key":"ref17","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"year":"2023","author":"Dehghani","article-title":"Scaling vision transformers to 22 billion parameters","key":"ref18"},{"doi-asserted-by":"publisher","key":"ref19","DOI":"10.1145\/3560815"},{"year":"2021","author":"Bommasani","article-title":"On the opportunities and risks of foundation models","key":"ref20"},{"doi-asserted-by":"publisher","key":"ref21","DOI":"10.18653\/v1\/2022.acl-demo.10"},{"year":"2009","author":"Krizhevsky","article-title":"Learning multiple layers of features from tiny images","key":"ref22"},{"year":"2021","author":"Ridnik","article-title":"Imagenet-21k pretraining for the masses","key":"ref23"},{"key":"ref24","first-page":"613","article-title":"Clipper: A Low-Latency Online Prediction Serving System","volume":"17","author":"Crankshaw","year":"2017","journal-title":"NSDI"},{"key":"ref25","first-page":"183","article-title":"DVABatch: Diversity-aware Multi-Entry Multi-Exit Batching for Efficient Processing of DNN Services on GPUs","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Cui"},{"doi-asserted-by":"publisher","key":"ref26","DOI":"10.1002\/nav.20231"},{"key":"ref27","first-page":"489","article-title":"PetS: A unified framework for Parameter-Efficient transformers serving","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Zhou"},{"year":"2019","author":"Wightman","article-title":"PyTorch Image Models","key":"ref28"},{"doi-asserted-by":"publisher","key":"ref29","DOI":"10.1109\/JSTARS.2019.2918242"},{"volume-title":"Faster and Cheaper Serverless Computing on Harvested Resources","author":"Zhang","first-page":"724","key":"ref30"},{"key":"ref31","first-page":"787","article-title":"SHEPHERD: Serving DNNs in the wild","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhang"},{"year":"2023","author":"Li","article-title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving","key":"ref32"},{"doi-asserted-by":"publisher","key":"ref33","DOI":"10.1109\/SC41405.2020.00073"},{"key":"ref34","first-page":"521","article-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu"},{"doi-asserted-by":"publisher","key":"ref35","DOI":"10.1109\/INFOCOM48880.2022.9796853"},{"doi-asserted-by":"publisher","key":"ref36","DOI":"10.1109\/TMC.2022.3189186"},{"volume-title":"NVIDIA","year":"2019","article-title":"Nvidia fastertransformer","key":"ref37"},{"doi-asserted-by":"publisher","key":"ref38","DOI":"10.1109\/INFOCOM48880.2022.9796939"},{"doi-asserted-by":"publisher","key":"ref39","DOI":"10.1109\/INFOCOM48880.2022.9796884"}],"event":{"name":"IEEE INFOCOM 2024 - IEEE Conference on Computer Communications","start":{"date-parts":[[2024,5,20]]},"location":"Vancouver, BC, Canada","end":{"date-parts":[[2024,5,23]]}},"container-title":["IEEE INFOCOM 2024 - IEEE Conference on Computer Communications"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/10621050\/10621073\/10621087.pdf?arnumber=10621087","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,26]],"date-time":"2025-08-26T19:02:26Z","timestamp":1756234946000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10621087\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,20]]},"references-count":39,"URL":"https:\/\/doi.org\/10.1109\/infocom52122.2024.10621087","relation":{},"subject":[],"published":{"date-parts":[[2024,5,20]]}}}