{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:17:37Z","timestamp":1773317857310,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100012166","name":"National Key Research and Development Program of China","doi-asserted-by":"publisher","award":["2024YFB4505603"],"award-info":[{"award-number":["2024YFB4505603"]}],"id":[{"id":"10.13039\/501100012166","id-type":"DOI","asserted-by":"publisher"}]},{"name":"the National Natural Science Foundation of China","award":["No.62192784"],"award-info":[{"award-number":["No.62192784"]}]},{"name":"the high-quality development project of MIIT and the Institute Guo Qiang at Tsinghua University","award":[""],"award-info":[{"award-number":[""]}]},{"name":"Young Elite Scientists Sponsorship Program by CAST","award":["No.2023QNRC001"],"award-info":[{"award-number":["No.2023QNRC001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759802","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"1429-1445","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["BurstEngine: An efficient distributed framework for training transformers On extremely Long sequences of over 1M tokens"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-3631-1233","authenticated-orcid":false,"given":"Ao","family":"Sun","sequence":"first","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8016-1952","authenticated-orcid":false,"given":"Weilin","family":"Zhao","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4726-7621","authenticated-orcid":false,"given":"Xu","family":"Han","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7821-0030","authenticated-orcid":false,"given":"Cheng","family":"Yang","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7709-2543","authenticated-orcid":false,"given":"Zhiyuan","family":"Liu","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3734-0266","authenticated-orcid":false,"given":"Chuan","family":"Shi","sequence":"additional","affiliation":[{"name":"Beijing University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6011-6115","authenticated-orcid":false,"given":"Maosong","family":"Sun","sequence":"additional","affiliation":[{"name":"Tsinghua University, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"Marah Abdin Jyoti Aneja Hany Awadalla Ahmed Awadallah Ammar\u00a0Ahmad Awan Nguyen Bach Amit Bahree Arash Bakhtiari Jianmin Bao Harkirat Behl et\u00a0al. 2024. Phi-3 technical report: A highly capable language model locally on your phone. 
arXiv preprint arXiv:2404.14219 (2024)."},{"key":"e_1_3_3_2_3_2","unstructured":"William Brandon Aniruddha Nrusimha Kevin Qian Zachary Ankner Tian Jin Zhiye Song and Jonathan Ragan-Kelley. 2023. Striped attention: Faster ring attention for causal transformers. arXiv preprint arXiv:2311.09431 (2023)."},{"key":"e_1_3_3_2_4_2","first-page":"1877","volume-title":"Proceedings of NeurIPS","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et\u00a0al. 2020. Language models are few-shot learners. In Proceedings of NeurIPS. 1877\u20131901."},{"key":"e_1_3_3_2_5_2","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174 (2016)."},{"key":"e_1_3_3_2_6_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.02283"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.52202\/075280-2142"},{"key":"e_1_3_3_2_8_2","unstructured":"Tri Dao. 2023. FlashAttention-2: Faster attention with better parallelism and work partitioning. arXiv preprint arXiv:2307.08691 (2023)."},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.52202\/068431-1189"},{"key":"e_1_3_3_2_10_2","unstructured":"Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur Alan Schelten Amy Yang Angela Fan et\u00a0al. 2024. The Llama 3 herd of models. arXiv preprint arXiv:2407.21783 (2024)."},{"key":"e_1_3_3_2_11_2","unstructured":"Jiarui Fang and Shangchun Zhao. 2024. A Unified Sequence Parallelism Approach for Long Context Generative AI. arXiv preprint arXiv:2405.07719 (2024)."},{"key":"e_1_3_3_2_12_2","unstructured":"Trevor Gale Deepak Narayanan Cliff Young and Matei Zaharia. 2023. MegaBlocks: Efficient sparse training with mixture-of-experts. Proceedings of Machine Learning and Systems 5 (2023) 288\u2013304."},{"key":"e_1_3_3_2_13_2","unstructured":"Scott Gray Alec Radford and Diederik\u00a0P Kingma. 2017. GPU kernels for block-sparse weights. arXiv preprint arXiv:1711.09224 3 2 (2017) 2."},{"key":"e_1_3_3_2_14_2","unstructured":"Diandian Gu Peng Sun Qinghao Hu Ting Huang Xun Chen Yingtong Xiong Guoteng Wang Qiaoling Chen Shangchun Zhao Jiarui Fang et\u00a0al. 2024. LoongTrain: Efficient training of long-sequence LLMs with head-context parallelism. arXiv preprint arXiv:2406.18485 (2024)."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"crossref","unstructured":"Kai Han Yunhe Wang Hanting Chen Xinghao Chen Jianyuan Guo Zhenhua Liu Yehui Tang An Xiao Chunjing Xu Yixing Xu et\u00a0al. 2022. A survey on vision transformer. IEEE transactions on pattern analysis and machine intelligence 45 1 (2022) 87\u2013110.","DOI":"10.1109\/TPAMI.2022.3152247"},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"crossref","unstructured":"Xu Han Zhengyan Zhang Ning Ding Yuxian Gu Xiao Liu Yuqi Huo Jiezhong Qiu Yuan Yao Ao Zhang Liang Zhang et\u00a0al. 2021. Pre-trained models: Past present and future. 
AI Open 2 (2021) 225\u2013250.","DOI":"10.1016\/j.aiopen.2021.08.002"},{"key":"e_1_3_3_2_17_2","first-page":"103","volume-title":"Proceedings of NeurIPS","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia\u00a0Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V Le, Yonghui Wu, et\u00a0al. 2019. GPipe: efficient training of giant neural networks using pipeline parallelism. In Proceedings of NeurIPS. 103\u2013112."},{"key":"e_1_3_3_2_18_2","unstructured":"Binyuan Hui Jian Yang Zeyu Cui Jiaxi Yang Dayiheng Liu Lei Zhang Tianyu Liu Jiajun Zhang Bowen Yu Keming Lu et\u00a0al. 2024. Qwen2.5-Coder technical report. arXiv preprint arXiv:2409.12186 (2024)."},{"key":"e_1_3_3_2_19_2","unstructured":"Sam\u00a0Ade Jacobs Masahiro Tanaka Chengming Zhang Minjia Zhang Shuaiwen\u00a0Leon Song Samyam Rajbhandari and Yuxiong He. 2023. DeepSpeed Ulysses: System optimizations for enabling training of extreme long sequence transformer models. arXiv preprint arXiv:2309.14509 (2023)."},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Salman Khan Muzammal Naseer Munawar Hayat Syed\u00a0Waqas Zamir Fahad\u00a0Shahbaz Khan and Mubarak Shah. 2022. Transformers in vision: A survey. ACM computing surveys (CSUR) 54 10s (2022) 1\u201341.","DOI":"10.1145\/3505244"},{"key":"e_1_3_3_2_21_2","volume-title":"Proceedings of MLSYS","author":"Korthikanti Vijay\u00a0Anand","year":"2023","unstructured":"Vijay\u00a0Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. 2023. Reducing activation recomputation in large transformer models. In Proceedings of MLSYS."},{"key":"e_1_3_3_2_22_2","volume-title":"First Conference on Language Modeling","author":"Li Dacheng","year":"2024","unstructured":"Dacheng Li, Rulin Shao, Anze Xie, Eric\u00a0P Xing, Xuezhe Ma, Ion Stoica, Joseph\u00a0E Gonzalez, and Hao Zhang. 2024. DISTFLASHATTN: Distributed Memory-efficient Attention for Long-context LLMs Training. In First Conference on Language Modeling."},{"key":"e_1_3_3_2_23_2","unstructured":"Shenggui Li Fuzhao Xue Chaitanya Baranwal Yongbin Li and Yang You. 2021. Sequence parallelism: Long sequence training from system perspective. arXiv preprint arXiv:2105.13120 (2021)."},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"crossref","unstructured":"Tianyang Lin Yuxin Wang Xiangyang Liu and Xipeng Qiu. 2022. A survey of transformers. AI open 3 (2022) 111\u2013132.","DOI":"10.1016\/j.aiopen.2022.10.001"},{"key":"e_1_3_3_2_25_2","unstructured":"Hao Liu Matei Zaharia and Pieter Abbeel. 2023. Ring attention with blockwise transformers for near-infinite context. arXiv preprint arXiv:2310.01889 (2023)."},{"key":"e_1_3_3_2_26_2","unstructured":"Cheng Luo Jiawei Zhao Zhuoming Chen Beidi Chen and Anima Anandkumar. 2024. Mini-Sequence Transformer: Optimizing Intermediate Memory for Long Sequences Training. (9 2024). [Online; accessed 2024-12-26]."},{"key":"e_1_3_3_2_27_2","unstructured":"Maxim Milakov and Natalia Gimelshein. 2018. Online normalizer calculation for softmax. arXiv preprint arXiv:1805.02867 (2018)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_2_29_2","unstructured":"NVIDIA. 2023. TransformerEngine. 
https:\/\/github.com\/NVIDIA\/TransformerEngine\/blob\/main\/transformer_engine\/pytorch\/attention.py#L1644."},{"key":"e_1_3_3_2_30_2","unstructured":"OpenBMB. 2023. BMTrain: Efficient Training for Big Models. https:\/\/github.com\/OpenBMB\/BMTrain."},{"key":"e_1_3_3_2_31_2","unstructured":"Markus\u00a0N Rabe and Charles Staats. 2021. Self-attention Does Not Need O(n^2) Memory. arXiv preprint arXiv:2112.05682 (2021)."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_3_2_33_2","first-page":"551","volume-title":"Proceedings of ATC","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza\u00a0Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. In Proceedings of ATC. 551\u2013564."},{"key":"e_1_3_3_2_34_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. LLaMA: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971 (2023)."},{"key":"e_1_3_3_2_35_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et\u00a0al. 2023. LLaMA 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"crossref","unstructured":"Leslie\u00a0G Valiant. 1990. A bridging model for parallel computation. Commun. ACM (1990) 103\u2013111.","DOI":"10.1145\/79173.79181"},{"key":"e_1_3_3_2_37_2","volume-title":"Proceedings of NeurIPS","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Proceedings of NeurIPS."},{"key":"e_1_3_3_2_38_2","unstructured":"Sinong Wang Belinda\u00a0Z Li Madian Khabsa Han Fang and Hao Ma. 2020. Linformer: Self-attention with linear complexity. arXiv preprint arXiv:2006.04768 (2020)."},{"key":"e_1_3_3_2_39_2","unstructured":"Yuzhong Wang Xu Han Weilin Zhao Guoyang Zeng Zhiyuan Liu and Maosong Sun. 2024. H3T: Efficient Integration of Memory Optimization and Parallelism for Large-scale Transformer Training. Advances in Neural Information Processing Systems 36 (2024)."},{"key":"e_1_3_3_2_40_2","volume-title":"Cut Your Losses in Large-Vocabulary Language Models","author":"Wijmans Erik","unstructured":"Erik Wijmans, Brody Huval, Alexander Hertzberg, Vladlen Koltun, and Philipp Kr\u00e4henb\u00fchl. [n. d.]. Cut Your Losses in Large-Vocabulary Language Models. arXiv:2411.09009 [cs] http:\/\/arxiv.org\/abs\/2411.09009"},{"key":"e_1_3_3_2_41_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP40776.2020.9053878"},{"key":"e_1_3_3_2_42_2","unstructured":"An Yang Baosong Yang Binyuan Hui Bo Zheng Bowen Yu Chang Zhou Chengpeng Li Chengyuan Li Dayiheng Liu Fei Huang et\u00a0al. 2024. Qwen2 technical report. 
arXiv preprint arXiv:2407.10671 (2024)."},{"key":"e_1_3_3_2_43_2","doi-asserted-by":"crossref","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Pritam Damania Bernard Nguyen Geeta Chauhan Yuchen Hao Ajit Mathews and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. https:\/\/doi.org\/10.14778\/3611540.3611569. Proceedings of the VLDB Endowment 16 12 (8 2023) 3848\u20133860. [Online; accessed 2024-12-18].","DOI":"10.14778\/3611540.3611569"},{"key":"e_1_3_3_2_44_2","volume-title":"The Twelfth International Conference on Learning Representations","author":"Zhu Deyao","year":"2024","unstructured":"Deyao Zhu, Jun Chen, Xiaoqian Shen, Xiang Li, and Mohamed Elhoseiny. 2024. MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models. In The Twelfth International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=1tZbq88f27"},{"key":"e_1_3_3_2_45_2","unstructured":"Zhuzilin. 2023. Ring-Flash-Attention. https:\/\/github.com\/zhuzilin\/ring-flash-attention.git."}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759802","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:28:43Z","timestamp":1773253723000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759802"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":44,"alternative-id":["10.1145\/3712285.3759802","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759802","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}