{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T07:01:49Z","timestamp":1761894109949,"version":"build-2065373602"},"publisher-location":"Singapore","reference-count":27,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819527243","type":"print"},{"value":"9789819527250","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,1]],"date-time":"2025-11-01T00:00:00Z","timestamp":1761955200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-2725-0_16","type":"book-chapter","created":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:19:22Z","timestamp":1761887962000},"page":"244-264","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Towards Coarse-to-Fine Evaluation of\u00a0Inference Efficiency for\u00a0Large Language Models"],"prefix":"10.1007","author":[{"given":"Yushuo","family":"Chen","sequence":"first","affiliation":[]},{"given":"Tianyi","family":"Tang","sequence":"additional","affiliation":[]},{"given":"Erge","family":"Xiang","sequence":"additional","affiliation":[]},{"given":"Linjiang","family":"Li","sequence":"additional","affiliation":[]},{"given":"Wayne Xin","family":"Zhao","sequence":"additional","affiliation":[]},{"given":"Jing","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Yunpeng","family":"Chai","sequence":"additional","affiliation":[]},{"given":"Ji-Rong","family":"Wen","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,1]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Ainslie, J., Lee-Thorp, J., de\u00a0Jong, M., Zemlyanskiy, Y., Lebr\u00f3n, F., Sanghai, S.: GQA: training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"16_CR2","unstructured":"Chiang, W.L., et al.: Vicuna: an open-source chatbot impressing GPT-4 with 90%* chatgpt quality (2023). https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/"},{"key":"16_CR3","unstructured":"Contributors, H.F.: huggingface\/text-generation-inference: Large language model text generation inference (2023). https:\/\/github.com\/huggingface\/text-generation-inference"},{"key":"16_CR4","unstructured":"Contributors, L.: Lmdeploy: a toolkit for compressing, deploying, and serving LLM (2023). https:\/\/github.com\/InternLM\/lmdeploy"},{"key":"16_CR5","unstructured":"Dao, T.: Flashattention-2: faster attention with better parallelism and work partitioning. CoRR (2023)"},{"key":"16_CR6","unstructured":"Dao, T., Fu, D.Y., Ermon, S., Rudra, A., R\u00e9, C.: Flashattention: fast and memory-efficient exact attention with IO-awareness. In: Proceedings of NeurIPS (2022)"},{"key":"16_CR7","doi-asserted-by":"publisher","unstructured":"Jiang, A.Q., et al.: Mistral 7b. CoRR abs\/2310.06825 (2023). https:\/\/doi.org\/10.48550\/ARXIV.2310.06825","DOI":"10.48550\/ARXIV.2310.06825"},{"key":"16_CR8","doi-asserted-by":"publisher","unstructured":"Kim, S., et al.: Full stack optimization of transformer inference: a survey. CoRR abs\/2302.14017 (2023). https:\/\/doi.org\/10.48550\/ARXIV.2302.14017","DOI":"10.48550\/ARXIV.2302.14017"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Kwon, W., et al.: Efficient memory management for large language model serving with pagedattention. In: Proceedings of the 29th Symposium on Operating Systems Principles, SOSP 2023, Koblenz, Germany, 23\u201326 October 2023 (2023)","DOI":"10.1145\/3600006.3613165"},{"key":"16_CR10","doi-asserted-by":"publisher","unstructured":"Luebke, D., et al.: GPGPU: general purpose computation on graphics hardware. In: International Conference on Computer Graphics and Interactive Techniques, SIGGRAPH 2004, Los Angeles, California, USA, 8\u201312 August 2004, Course Notes, p.\u00a033. ACM (2004). https:\/\/doi.org\/10.1145\/1103900.1103933","DOI":"10.1145\/1103900.1103933"},{"key":"16_CR11","doi-asserted-by":"publisher","unstructured":"Miao, X., et al.: Towards efficient generative large language model serving: a survey from algorithms to systems. CoRR abs\/2312.15234 (2023). https:\/\/doi.org\/10.48550\/ARXIV.2312.15234","DOI":"10.48550\/ARXIV.2312.15234"},{"key":"16_CR12","unstructured":"Microsoft, T.: microsoft\/deepspeed-mii: Mii makes low-latency and high-throughput inference possible, powered by deepspeed (2023). https:\/\/github.com\/microsoft\/DeepSpeed-MII"},{"key":"16_CR13","unstructured":"ModelTC, T.: Modeltc\/lightllm: Lightllm is a python-based LLM (large language model) inference and serving framework, notable for its lightweight design, easy scalability, and high-speed performance (2023). https:\/\/github.com\/ModelTC\/lightllm"},{"key":"16_CR14","unstructured":"NVIDIA, T.: Nvidia\/fastertransformer: Transformer related optimization, including BERT, GPT (2021). https:\/\/github.com\/NVIDIA\/FasterTransformer"},{"key":"16_CR15","unstructured":"NVIDIA, T.: Nvidia\/tensorrt-llm: Tensorrt-llm provides users with an easy-to-use python api to define large language models (llms) and build tensorrt engines that contain state-of-the-art optimizations to perform inference efficiently on nvidia gpus. tensorrt-llm also contains components to create python and C++ runtimes that execute those tensorrt engines (2023). https:\/\/github.com\/NVIDIA\/TensorRT-LLM"},{"key":"16_CR16","doi-asserted-by":"publisher","unstructured":"Pope, R., et al.: Efficiently scaling transformer inference. CoRR abs\/2211.05102 (2022). https:\/\/doi.org\/10.48550\/ARXIV.2211.05102","DOI":"10.48550\/ARXIV.2211.05102"},{"key":"16_CR17","unstructured":"ShareGPT, T.: ShareGPT: share your wildest chatgpt conversations with one click (2023). https:\/\/sharegpt.com\/"},{"key":"16_CR18","unstructured":"Shazeer, N.: GLU variants improve transformer abs\/2002.05202 (2020)"},{"key":"16_CR19","unstructured":"Shi, S., Zhao, E., Cai, D., Cui, L., Huang, X., Li, H.: Inferflow: an efficient and highly configurable inference engine for large language models (2024)"},{"key":"16_CR20","unstructured":"Taori, R., et al.: Stanford alpaca: an instruction-following llama model (2023). https:\/\/github.com\/tatsu-lab\/stanford_alpaca"},{"key":"16_CR21","unstructured":"Touvron, H., et al.: Llama: open and efficient foundation language models. CoRR (2023)"},{"key":"16_CR22","unstructured":"Touvron, H., et al.: Llama 2: open foundation and fine-tuned chat models. CoRR (2023)"},{"key":"16_CR23","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceeding of NeurIPS (2017)"},{"key":"16_CR24","unstructured":"Wolf, T., et al.: Transformers: state-of-the-art natural language processing. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pp. 38\u201345. Association for Computational Linguistics, Online (2020). https:\/\/www.aclweb.org\/anthology\/2020.emnlp-demos.6"},{"key":"16_CR25","unstructured":"Xiao, G., Tian, Y., Chen, B., Han, S., Lewis, M.: Efficient streaming language models with attention sinks. arXiv (2023)"},{"key":"16_CR26","unstructured":"Zhang, B., Sennrich, R.: Root mean square layer normalization. In: Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, Vancouver, BC, Canada, 8\u201314 December 2019, pp. 12360\u201312371 (2019)"},{"key":"16_CR27","unstructured":"Zhao, W.X., et al.: A survey of large language models. arXiv preprint arXiv:2303.18223 (2023)"}],"container-title":["Lecture Notes in Computer Science","Chinese Computational Linguistics"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-2725-0_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,31]],"date-time":"2025-10-31T05:19:31Z","timestamp":1761887971000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-2725-0_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,1]]},"ISBN":["9789819527243","9789819527250"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-2725-0_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,1]]},"assertion":[{"value":"1 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CCL","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China National Conference on Chinese Computational Linguistics","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Jinan","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cncl2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/link.springer.com\/conference\/cncl","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}