{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T23:14:57Z","timestamp":1769642097553,"version":"3.49.0"},"publisher-location":"Singapore","reference-count":17,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819530601","type":"print"},{"value":"9789819530618","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,13]],"date-time":"2025-11-13T00:00:00Z","timestamp":1762992000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,13]],"date-time":"2025-11-13T00:00:00Z","timestamp":1762992000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-981-95-3061-8_27","type":"book-chapter","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T05:02:27Z","timestamp":1762923747000},"page":"257-265","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["A Review of\u00a0Optimization Techniques for\u00a0Large Language Model Inference"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-8832-5859","authenticated-orcid":false,"given":"Yujia","family":"Cao","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3574-028X","authenticated-orcid":false,"given":"Xi","family":"Tao","sequence":"additional","affiliation":[]},{"given":"Weipeng","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Chuanfei","family":"Xu","sequence":"additional","affiliation":[]},{"given":"Zhong","family":"Ming","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,13]]},"reference":[{"key":"27_CR1","unstructured":"Agrawal, A., Panwar, A., Mohan, J., Kwatra, N., Gulavani, B.S., Ramjee, R.: SARATHI: efficient LLM inference by piggybacking decodes with chunked prefills. arXiv preprint arXiv:2308.16369 (2023)"},{"key":"27_CR2","doi-asserted-by":"crossref","unstructured":"Ainslie, J., Lee-Thorp, J., de\u00a0Jong, M., Zemlyanskiy, Y., Lebron, F., Sanghai, S.: GQA: training generalized multi-query transformer models from multi-head checkpoints. In: Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pp. 4895\u20134901 (2023)","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"issue":"1","key":"27_CR3","doi-asserted-by":"publisher","first-page":"126","DOI":"10.1109\/MSP.2017.2765695","volume":"35","author":"Y Cheng","year":"2018","unstructured":"Cheng, Y., Wang, D., Zhou, P., Zhang, T.: Model compression and acceleration for deep neural networks: the principles, progress, and challenges. IEEE Signal Process. Mag. 35(1), 126\u2013136 (2018)","journal-title":"IEEE Signal Process. Mag."},{"key":"27_CR4","unstructured":"Dao, T.: FlashAttention-2: faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)"},{"key":"27_CR5","unstructured":"Dao, T., Fu, D., Ermon, S., Rudra, A., R\u00e9, C.: FlashAttention: fast and memory-efficient exact attention with IO-awareness. In: Advances in Neural Information Processing Systems, vol. 35, pp. 16344\u201316359 (2022)"},{"key":"27_CR6","unstructured":"Del\u00a0Corro, L., Del\u00a0Giorno, A., Agarwal, S., Yu, B., Awadallah, A., Mukherjee, S.: SkipDecode: autoregressive skip decoding with batching and caching for efficient LLM inference. arXiv preprint arXiv:2307.02628 (2023)"},{"key":"27_CR7","doi-asserted-by":"crossref","unstructured":"Hooper, C., et al.: KVQuant: towards 10 million context length LLM inference with KV cache quantization. In: Advances in Neural Information Processing Systems, vol. 37, pp. 1270\u20131303 (2024)","DOI":"10.52202\/079017-0040"},{"key":"27_CR8","doi-asserted-by":"crossref","unstructured":"Kwon, W., et al.: Efficient memory management for large language model serving with PagedAttention. In: Proceedings of the 29th Symposium on Operating Systems Principles, pp. 611\u2013626 (2023)","DOI":"10.1145\/3600006.3613165"},{"key":"27_CR9","unstructured":"Lin, B., et\u00a0al.: Infinite-LLM: efficient LLM service for long context with DistAttention and distributed KVCache. arXiv preprint cs.DC\/2401.02669 (2024)"},{"key":"27_CR10","unstructured":"Liu, H., Zaharia, M., Abbeel, P.: Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889 (2023)"},{"key":"27_CR11","unstructured":"Mishra, R., Gupta, H.P., Dutta, T.: A survey on deep neural network compression: challenges, overview, and solutions. arXiv preprint arXiv:2010.03954 (2020)"},{"key":"27_CR12","unstructured":"Shazeer, N.: Fast transformer decoding: one write-head is all you need. arXiv preprint arXiv:1911.02150 (2019)"},{"issue":"4","key":"27_CR13","doi-asserted-by":"publisher","first-page":"2167","DOI":"10.1109\/COMST.2020.3007787","volume":"22","author":"Y Shi","year":"2020","unstructured":"Shi, Y., Yang, K., Jiang, T., Zhang, J., Letaief, K.B.: Communication-efficient edge AI: algorithms and systems. IEEE Commun. Surv. Tutorials 22(4), 2167\u20132191 (2020)","journal-title":"IEEE Commun. Surv. Tutorials"},{"key":"27_CR14","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"27_CR15","unstructured":"Xiao, G., Lin, J., Seznec, M., Wu, H., Demouth, J., Han, S.: SmoothQuant: accurate and efficient post-training quantization for large language models. In: International Conference on Machine Learning, pp. 38087\u201338099. PMLR (2023)"},{"key":"27_CR16","unstructured":"Yao, Z., Yazdani Aminabadi, R., Zhang, M., Wu, X., Li, C., He, Y.: ZeroQuant: efficient and affordable post-training quantization for large-scale transformers. In: Advances in Neural Information Processing Systems, vol. 35, pp. 27168\u201327183 (2022)"},{"key":"27_CR17","unstructured":"Yu, G.I., Jeong, J.S., Kim, G.W., Kim, S., Chun, B.G.: ORCA: a distributed serving system for transformer-based generative models. In: 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pp. 521\u2013538 (2022)"}],"container-title":["Lecture Notes in Computer Science","Knowledge Science, Engineering and Management"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-95-3061-8_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,28]],"date-time":"2026-01-28T12:13:59Z","timestamp":1769602439000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-95-3061-8_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,13]]},"ISBN":["9789819530601","9789819530618"],"references-count":17,"URL":"https:\/\/doi.org\/10.1007\/978-981-95-3061-8_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,13]]},"assertion":[{"value":"13 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"KSEM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Knowledge Science, Engineering and Management","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Macao","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 August 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"7 August 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"18","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"ksem2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/ksem2025.scimeeting.cn\/en\/web\/index\/27434","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}