{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T07:11:05Z","timestamp":1771657865507,"version":"3.50.1"},"reference-count":31,"publisher":"Zhejiang University Press","issue":"5","license":[{"start":{"date-parts":[[2025,4,2]],"date-time":"2025-04-02T00:00:00Z","timestamp":1743552000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,4,2]],"date-time":"2025-04-02T00:00:00Z","timestamp":1743552000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Front Inform Technol Electron Eng"],"published-print":{"date-parts":[[2025,5]]},"DOI":"10.1631\/fitee.2400602","type":"journal-article","created":{"date-parts":[[2025,4,3]],"date-time":"2025-04-03T22:54:46Z","timestamp":1743720886000},"page":"770-787","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Memory-efficient tensor parallelism for long-sequence Transformer training","\u9762\u5411\u957f\u5e8f\u5217 Transformer \u8bad\u7ec3\u7684\u5185\u5b58\u9ad8\u6548\u5f20\u91cf\u5e76\u884c\u65b9\u6cd5"],"prefix":"10.1631","volume":"26","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5590-5179","authenticated-orcid":false,"given":"Peng","family":"Liang","sequence":"first","affiliation":[]},{"given":"Linbo","family":"Qiao","sequence":"additional","affiliation":[]},{"given":"Yanqi","family":"Shi","sequence":"additional","affiliation":[]},{"given":"Hao","family":"Zheng","sequence":"additional","affiliation":[]},{"given":"Yu","family":"Tang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9743-2034","authenticated-orcid":false,"given":"Dongsheng","family":"Li","sequence":"additional","affiliation":[]}],"member":"635","published-online":{"date-parts":[[2025,4,2]]},"reference":[{"key":"ref1","author":"Achiam","year":"2023","journal-title":"GPT-4 technical report"},{"key":"ref2","author":"Beltagy","year":"2020","journal-title":"Longformer: the long-document Transformer"},{"key":"ref3","article-title":"Language models are few-shot learners","volume-title":"Proc 34th Int Conf on Neural Information Processing Systems","author":"Brown","year":"2020"},{"key":"ref4","first-page":"578","article-title":"TVM: an automated end-to-end optimizing compiler for deep learning","author":"Chen","year":"2018","journal-title":"13th USENIX Symp on Operating Systems Design and Implementation"},{"issue":"1","key":"ref5","first-page":"240","article-title":"PaLM: scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2022","journal-title":"J Mach Learn Res"},{"key":"ref6","article-title":"FlashAttention-2: faster attention with better parallelism and work partitioning","volume-title":"Proc 12th Int Conf on Learning Representations","author":"Dao","year":"2024"},{"key":"ref7","article-title":"FlashAttention: fast and memory-efficient exact attention with IO-awareness","volume-title":"Proc 36th Int Conf on Neural Information Processing Systems","author":"Dao","year":"2022"},{"key":"ref8","first-page":"4171","article-title":"BERT: pre-training of deep bidirectional Transformers for language understanding","volume-title":"Proc Conf of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies","author":"Devlin","year":"2019"},{"key":"ref9","article-title":"GPipe: efficient training of giant neural networks using pipeline parallelism","volume-title":"Proc 33rd Int Conf on Neural Information Processing Systems","author":"Huang","year":"2019"},{"key":"ref10","author":"Huang","year":"2023","journal-title":"Advancing Transformer architecture in long-context large language models: a comprehensive survey"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/3662158.3662806"},{"key":"ref12","author":"Kaddour","year":"2023","journal-title":"Challenges and applications of large language models"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1406.3269"},{"key":"ref14","article-title":"Reducing activation recomputation in large Transformer models","volume-title":"Proc 6th Conf on Machine Learning and Systems","author":"Korthikanti","year":"2023"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/tpds.2023.3247001"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/tpds.2019.2928289"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.acl-long.134"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/tpds.2023.3281931"},{"key":"ref19","author":"Liu","year":"2023","journal-title":"Ring Attention with blockwise Transformers for near-infinite context"},{"key":"ref20","author":"Liu","year":"2024","journal-title":"Sora: a review on background, technology, limitations, and opportunities of large vision models"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607073"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"ref23","first-page":"7937","article-title":"Memory-efficient pipeline-parallel DNN training","volume-title":"Proc 38th Int Conf on Machine Learning","author":"Narayanan","year":"2021b"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.52202\/079017-2193"},{"issue":"1","key":"ref27","first-page":"1929","article-title":"Dropout: a simple way to prevent neural networks from overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"J Mach Learn Res"},{"key":"ref28","author":"Tarassow","year":"2023","journal-title":"The potential of LLMs for coding with low-resource and domain-specific programming languages"},{"key":"ref29","author":"Touvron","year":"2023","journal-title":"LLaMA: open and efficient foundation language models"},{"key":"ref30","article-title":"Understanding and improving layer normalization","volume-title":"Proc 33rd Int Conf on Neural Information Processing Systems","author":"Xu","year":"2019"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.14778\/3611540.3611569"}],"container-title":["Frontiers of Information Technology & Electronic Engineering"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1631\/FITEE.2400602.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1631\/FITEE.2400602\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1631\/FITEE.2400602.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,21]],"date-time":"2026-02-21T06:38:28Z","timestamp":1771655908000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1631\/FITEE.2400602"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,4,2]]},"references-count":31,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2025,5]]}},"alternative-id":["602"],"URL":"https:\/\/doi.org\/10.1631\/fitee.2400602","relation":{},"ISSN":["2095-9184","2095-9230"],"issn-type":[{"value":"2095-9184","type":"print"},{"value":"2095-9230","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,4,2]]},"assertion":[{"value":"17 July 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"23 February 2025","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 April 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"Dongsheng LI is a corresponding expert of Frontiers of Information Technology & Electronic Engineering, and he was not involved with the peer review process of this paper. All the authors declare that they have no conflict of interest.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of interest"}}]}}