{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T14:40:08Z","timestamp":1750603208107,"version":"3.41.0"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Nature Singapore","isbn-type":[{"value":"9789819687244","type":"print"},{"value":"9789819687251","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,1]],"date-time":"2025-01-01T00:00:00Z","timestamp":1735689600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-96-8725-1_25","type":"book-chapter","created":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T14:23:25Z","timestamp":1750602205000},"page":"307-317","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Parallelization Techniques for\u00a0Large Language Models: A Review from\u00a0Training to\u00a0Inference"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-0048-7589","authenticated-orcid":false,"given":"Shanwen","family":"Liu","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3574-028X","authenticated-orcid":false,"given":"Xi","family":"Tao","sequence":"additional","affiliation":[]},{"given":"Weipeng","family":"Cao","sequence":"additional","affiliation":[]},{"given":"Zhong","family":"Ming","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,6,21]]},"reference":[{"key":"25_CR1","unstructured":"Bai, J., et al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"25_CR2","unstructured":"Duan, J., et al.: Efficient training of large language models on distributed infrastructures: a survey. arXiv preprint arXiv:2407.20018 (2024)"},{"key":"25_CR3","doi-asserted-by":"crossref","unstructured":"Hu, C., et al.: Inference without interference: disaggregate LLM inference for mixed downstream workloads. arXiv preprint arXiv:2401.11181 (2024)","DOI":"10.1145\/3732941"},{"key":"25_CR4","unstructured":"Huang, Y., et al.: Gpipe: efficient training of giant neural networks using pipeline parallelism. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"issue":"1","key":"25_CR5","doi-asserted-by":"publisher","first-page":"79","DOI":"10.1162\/neco.1991.3.1.79","volume":"3","author":"RA Jacobs","year":"1991","unstructured":"Jacobs, R.A., Jordan, M.I., Nowlan, S.J., Hinton, G.E.: Adaptive mixtures of local experts. Neural Comput. 3(1), 79\u201387 (1991)","journal-title":"Neural Comput."},{"key":"25_CR6","doi-asserted-by":"crossref","unstructured":"Jacobs, S.A., et al.: Deepspeed ulysses: system optimizations for enabling training of extreme long sequence transformer models. arXiv preprint arXiv:2309.14509 (2023)","DOI":"10.1109\/IPDPSW63119.2024.00208"},{"key":"25_CR7","unstructured":"Jeon, B., et al.: Graphpipe: improving performance and scalability of DNN training with graph pipeline parallelism (2024). https:\/\/arxiv.org\/abs\/2406.17145"},{"key":"25_CR8","first-page":"341","volume":"5","author":"VA Korthikanti","year":"2023","unstructured":"Korthikanti, V.A., et al.: Reducing activation recomputation in large transformer models. Proc. Mach. Learn. Syst. 5, 341\u2013353 (2023)","journal-title":"Proc. Mach. Learn. Syst."},{"key":"25_CR9","unstructured":"Kwon, W., et al.: Efficient memory management for large language model serving with pagedattention (2023). https:\/\/arxiv.org\/abs\/2309.06180"},{"key":"25_CR10","unstructured":"Lepikhin, D., et al.: Gshard: scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020)"},{"key":"25_CR11","doi-asserted-by":"crossref","unstructured":"Li, S., et al.: Pytorch distributed: experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704 (2020)","DOI":"10.14778\/3415478.3415530"},{"key":"25_CR12","unstructured":"Li, S., Xue, F., Baranwal, C., Li, Y., You, Y.: Sequence parallelism: long sequence training from system perspective. arXiv preprint arXiv:2105.13120 (2021)"},{"issue":"8","key":"25_CR13","doi-asserted-by":"publisher","first-page":"2377","DOI":"10.1109\/TPDS.2023.3281931","volume":"34","author":"P Liang","year":"2023","unstructured":"Liang, P., et al.: A survey on auto-parallelism of large-scale deep learning training. IEEE Trans. Parallel Distrib. Syst. 34(8), 2377\u20132390 (2023)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"25_CR14","unstructured":"Lin, B., et al.: Infinite-LLM: efficient LLM service for long context with distattention and distributed kvcache. arXiv preprint arXiv:2401.02669 (2024)"},{"key":"25_CR15","unstructured":"Liu, A., et al.: Deepseek-v3 technical report. arXiv preprint arXiv:2412.19437 (2024)"},{"key":"25_CR16","unstructured":"Liu, J., et al.: Exploring and evaluating real-world cxl: use cases and system adoption. arXiv preprint arXiv:2405.14209 (2024)"},{"key":"25_CR17","unstructured":"Liu, Y., Li, S., Fang, J., Shao, Y., Yao, B., You, Y.: Colossal-auto: unified automation of parallelization and activation checkpoint for large-scale models. arXiv preprint arXiv:2302.02599 (2023)"},{"key":"25_CR18","doi-asserted-by":"crossref","unstructured":"Miao, X., et al.: Spotserve: serving generative large language models on preemptible instances. In: Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, vol. 2, pp. 1112\u20131127 (2024)","DOI":"10.1145\/3620665.3640411"},{"key":"25_CR19","doi-asserted-by":"crossref","unstructured":"Narayanan, D., et al.: Pipedream: generalized pipeline parallelism for DNN training. In: Proceedings of the 27th ACM Symposium on Operating Systems Principles, pp. 1\u201315 (2019)","DOI":"10.1145\/3341301.3359646"},{"key":"25_CR20","doi-asserted-by":"crossref","unstructured":"Patel, P., et al.: Splitwise: efficient generative LLM inference using phase splitting. In: 2024 ACM\/IEEE 51st Annual International Symposium on Computer Architecture (ISCA), pp. 118\u2013132. IEEE (2024)","DOI":"10.1109\/ISCA59077.2024.00019"},{"key":"25_CR21","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I., et\u00a0al.: Improving language understanding by generative pre-training (2018)"},{"key":"25_CR22","doi-asserted-by":"crossref","unstructured":"Rajbhandari, S., Rasley, J., Ruwase, O., He, Y.: Zero: memory optimizations toward training trillion parameter models (2020). https:\/\/arxiv.org\/abs\/1910.02054","DOI":"10.1109\/SC41405.2020.00024"},{"key":"25_CR23","doi-asserted-by":"crossref","unstructured":"Rasley, J., Rajbhandari, S., Ruwase, O., He, Y.: Deepspeed: system optimizations enable training deep learning models with over 100 billion parameters. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, pp. 3505\u20133506 (2020)","DOI":"10.1145\/3394486.3406703"},{"key":"25_CR24","unstructured":"Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., Catanzaro, B.: Megatron-lm: training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)"},{"key":"25_CR25","unstructured":"Touvron, H., et al.: Llama: open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)"},{"key":"25_CR26","doi-asserted-by":"crossref","unstructured":"Wang, B., Xu, Q., Bian, Z., You, Y.: Tesseract: parallelize the tensor parallelism efficiently. In: Proceedings of the 51st International Conference on Parallel Processing, pp. 1\u201311 (2022)","DOI":"10.1145\/3545008.3545087"},{"key":"25_CR27","doi-asserted-by":"crossref","unstructured":"Zhao, Y., et al.: Pytorch FSDP: experiences on scaling fully sharded data parallel. arXiv preprint arXiv:2304.11277 (2023)","DOI":"10.14778\/3611540.3611569"},{"key":"25_CR28","unstructured":"Zhong, Y., et al.: Distserve: disaggregating prefill and decoding for goodput-optimized large language model serving. arXiv preprint arXiv:2401.09670 (2024)"},{"key":"25_CR29","unstructured":"Zhou, Z., et al.: A survey on efficient inference for large language models. arXiv preprint arXiv:2404.14294 (2024)"}],"container-title":["Lecture Notes in Computer Science","Wireless Artificial Intelligent Computing Systems and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-96-8725-1_25","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,22]],"date-time":"2025-06-22T14:23:31Z","timestamp":1750602211000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-96-8725-1_25"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025]]},"ISBN":["9789819687244","9789819687251"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-96-8725-1_25","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025]]},"assertion":[{"value":"21 June 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"WASA","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Wireless Artificial Intelligent Computing Systems and Applications","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tokyo","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Japan","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"24 June 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"26 June 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"19","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"wasa2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/wasa-conference.org\/WASA2025\/index.html#","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}