{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T10:36:29Z","timestamp":1763202989166,"version":"build-2065373602"},"reference-count":29,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T00:00:00Z","timestamp":1756771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,9,2]],"date-time":"2025-09-02T00:00:00Z","timestamp":1756771200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,9,2]]},"DOI":"10.1109\/cluster59342.2025.11186487","type":"proceedings-article","created":{"date-parts":[[2025,10,7]],"date-time":"2025-10-07T17:35:09Z","timestamp":1759858509000},"page":"1-12","source":"Crossref","is-referenced-by-count":1,"title":["BMPipe: Bubble-Memory Co-Optimization Strategy Planner for Very-Large DNN Training"],"prefix":"10.1109","author":[{"given":"Ruiwen","family":"Wang","sequence":"first","affiliation":[{"name":"Sorbonne University,Paris,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chong","family":"Li","sequence":"additional","affiliation":[{"name":"Huawei Technologies France SASU,Paris,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Thibaut","family":"Tachon","sequence":"additional","affiliation":[{"name":"Huawei Technologies France SASU,Paris,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Raja","family":"Appuswamy","sequence":"additional","affiliation":[{"name":"EURECOM,Biot,France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Teng","family":"Su","sequence":"additional","affiliation":[{"name":"Huawei Technologies Co., Ltd.,Hangzhou,China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"volume-title":"Llama: Open and efficient foundation language models","year":"2023","author":"Touvron","key":"ref1"},{"volume-title":"Qwen2.5 technical report","year":"2025","author":"Qwen","key":"ref2"},{"key":"ref3","article-title":"Pangu- $\\alpha$: Large-scale autoregressive pretrained chinese language models with auto-parallel computation","volume":"abs\/2104.12369","author":"Zeng","year":"2021","journal-title":"CoRR"},{"volume-title":"BLOOM: A 176B-Parameter OpenAccess Multilingual Language Model","year":"2023","author":"Scao","key":"ref4"},{"key":"ref5","article-title":"An image is worth $16 \\times 16$ words: Transformers for image recognition at scale","volume":"abs\/2010.11929, 2020","author":"Dosovitskiy","journal-title":"CoRR"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/iccv48922.2021.00986"},{"volume-title":"Blip-2: Bootstrapping language-image pretraining with frozen image encoders and large language models","year":"2023","author":"Li","key":"ref7"},{"key":"ref8","first-page":"3489234916","article-title":"Visual instruction tuning","volume":"36","author":"Liu","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"ref9","article-title":"BERT: pre-training of deep bidirectional transformers for language understanding","volume":"abs\/1810.04805","author":"Devlin","year":"2018","journal-title":"CoRR"},{"volume-title":"Using deepspeed and megatron to train megatron-turing nlg 530b, a large-scale generative language model","year":"2022","author":"Smith","key":"ref10"},{"issue":"240","key":"ref11","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303957"},{"key":"ref13","article-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism","volume":"abs\/1909.08053","author":"Shoeybi","year":"2019","journal-title":"CoRR"},{"key":"ref14","article-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems","author":"Huang","year":"2019"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"ref17","first-page":"341","article-title":"Reducing activation recomputation in large transformer models","volume-title":"Proceedings of Machine Learning and Systems","volume":"5","author":"Korthikanti","year":"2023"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/tpds.2023.3247001"},{"key":"ref19","first-page":"16639","article-title":"BPipe: Memory-balanced pipeline parallelism for training large language models","volume-title":"Proceedings of the 40th International Conference on Machine Learning","volume":"202","author":"Kim"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607073"},{"key":"ref22","article-title":"Zero bubble (almost) pipeline parallelism","volume-title":"The Twelfth International Conference on Learning Representations","author":"Qi","year":"2024"},{"volume-title":"Deepseek-v3 technical report","year":"2025","key":"ref23"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651359"},{"volume-title":"Gurobi Optimization, LLC, Gurobi Optimizer Reference Manual","year":"2024","key":"ref26"},{"volume-title":"Atlas Ascned-NPU Clusters","key":"ref27"},{"volume-title":"Ascend\u2019s heterogeneous Compute Architecture for Neural Networks (CANN)","key":"ref28"},{"volume-title":"MindSpore source code","key":"ref29"}],"event":{"name":"2025 IEEE International Conference on Cluster Computing (CLUSTER)","start":{"date-parts":[[2025,9,2]]},"location":"United Kingdom","end":{"date-parts":[[2025,9,5]]}},"container-title":["2025 IEEE International Conference on Cluster Computing (CLUSTER)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11186399\/11186452\/11186487.pdf?arnumber=11186487","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,10,9]],"date-time":"2025-10-09T17:51:00Z","timestamp":1760032260000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11186487\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,9,2]]},"references-count":29,"URL":"https:\/\/doi.org\/10.1109\/cluster59342.2025.11186487","relation":{},"subject":[],"published":{"date-parts":[[2025,9,2]]}}}