{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T11:40:10Z","timestamp":1767958810779,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":18,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,1,26]]},"DOI":"10.1145\/3773656.3773667","type":"proceedings-article","created":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T10:22:11Z","timestamp":1767954131000},"page":"308-319","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["PRISM: Profiling-Free Symbolic Memory-Driven Strategy Planner for Large DNN Model Training"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-6709-5970","authenticated-orcid":false,"given":"Ruiwen","family":"Wang","sequence":"first","affiliation":[{"name":"Sorbonne University, Paris, France and Distributed and Parallel Technologies Laboratory, Huawei Technologies, Boulogne-Billancourt, France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6981-5355","authenticated-orcid":false,"given":"Philippe","family":"Fang","sequence":"additional","affiliation":[{"name":"Distributed and Parallel Technologies Laboratory, Huawei Technologies, Boulogne-Billancourt, France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4160-7170","authenticated-orcid":false,"given":"Chong","family":"Li","sequence":"additional","affiliation":[{"name":"Distributed and Parallel Technologies Laboratory, Huawei Technologies, Boulogne-Billancourt, France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3264-5535","authenticated-orcid":false,"given":"Thibaut","family":"Tachon","sequence":"additional","affiliation":[{"name":"Distributed and Parallel Technologies Laboratory, Huawei Technologies, Boulogne-Billancourt, France"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5887-4091","authenticated-orcid":false,"given":"Raja","family":"Appuswamy","sequence":"additional","affiliation":[{"name":"Institut Eur\u00e9com, Biot, France"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,25]]},"reference":[{"key":"e_1_3_3_1_2_2","unstructured":"2025. Huawei Collective Communication Library (HCCL). https:\/\/gitee.com\/ascend\/cann-hccl. Accessed: 2025-09-20."},{"key":"e_1_3_3_1_3_2","unstructured":"2025. MindSpore: An Open Source Deep Learning Training\/Inference Framework. https:\/\/gitee.com\/mindspore\/mindspore. Accessed: 2025-09-20."},{"key":"e_1_3_3_1_4_2","doi-asserted-by":"crossref","unstructured":"Joshua Ainslie James Lee-Thorp Michiel de Jong Yury Zemlyanskiy Federico Lebr\u00f3n and Sumit Sanghai. 2023. GQA: Training Generalized Multi-Query Transformer Models from Multi-Head Checkpoints. arxiv:https:\/\/arXiv.org\/abs\/2305.13245\u00a0[cs.CL] https:\/\/arxiv.org\/abs\/2305.13245","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"e_1_3_3_1_5_2","unstructured":"Jehyeon Bang Yujeong Choi Myeongwoo Kim Yongdeok Kim and Minsoo Rhu. 2024. vTrain: A Simulation Framework for Evaluating Cost-effective and Compute-optimal Large Language Model Training. arxiv:https:\/\/arXiv.org\/abs\/2312.12391\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/2312.12391"},{"key":"e_1_3_3_1_6_2","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. CoRR abs\/1604.06174 (2016). arXiv:https:\/\/arXiv.org\/abs\/1604.06174http:\/\/arxiv.org\/abs\/1604.06174"},{"key":"e_1_3_3_1_7_2","volume-title":"Advances in Neural Information Processing Systems","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Marc'\u00a0aurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, Quoc Le, and Andrew Ng. 2012. Large Scale Distributed Deep Networks. In Advances in Neural Information Processing Systems , F.\u00a0Pereira, C.J. Burges, L.\u00a0Bottou, and K.Q. Weinberger (Eds.), Vol.\u00a025. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2012\/file\/6aca97005c68f1206823815f66102863-Paper.pdf"},{"key":"e_1_3_3_1_8_2","unstructured":"William Fedus Barret Zoph and Noam Shazeer. 2022. Switch transformers: scaling to trillion parameter models with simple and efficient sparsity. J. Mach. Learn. Res. 23 1 Article 120 (Jan. 2022) 39\u00a0pages."},{"key":"e_1_3_3_1_9_2","volume-title":"GPipe: efficient training of giant neural networks using pipeline parallelism","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia\u00a0Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: efficient training of giant neural networks using pipeline parallelism. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_3_1_10_2","doi-asserted-by":"publisher","DOI":"10.1145\/3662158.3662806"},{"key":"e_1_3_3_1_11_2","volume-title":"MLSys","author":"Korthikanti Vijay\u00a0Anand","year":"2023","unstructured":"Vijay\u00a0Anand Korthikanti, Jared Casper, Sangkug Lym, Lawrence McAfee, Michael Andersch, Mohammad Shoeybi, and Bryan Catanzaro. 2023. Reducing Activation Recomputation in Large Transformer Models. In MLSys. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2023\/hash\/80083951326cf5b35e5100260d64ed81-Abstract-mlsys2023.html"},{"key":"e_1_3_3_1_12_2","volume-title":"International Conference on Learning Representations","author":"Lepikhin Dmitry","year":"2021","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2021. {GS}hard: Scaling Giant Models with Conditional Computation and Automatic Sharding. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=qrwe7XHTmYb"},{"key":"e_1_3_3_1_13_2","doi-asserted-by":"publisher","unstructured":"Xupeng Miao Yujie Wang Youhe Jiang Chunan Shi Xiaonan Nie Hailin Zhang and Bin Cui. 2022. Galvatron: Efficient Transformer Training over Multiple GPUs Using Automatic Parallelism. Proc. VLDB Endow. 16 3 (Nov. 2022) 470\u2013479. 10.14778\/3570690.3570697","DOI":"10.14778\/3570690.3570697"},{"key":"e_1_3_3_1_14_2","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_3_1_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_3_1_16_2","series-title":"Proceedings of Machine Learning Research","first-page":"18332","volume-title":"Proceedings of the 39th International Conference on Machine Learning","volume":"162","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza\u00a0Yazdani Aminabadi, Ammar\u00a0Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale. In Proceedings of the 39th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0162), Kamalika Chaudhuri, Stefanie Jegelka, Le\u00a0Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato (Eds.). PMLR, 18332\u201318346. https:\/\/proceedings.mlr.press\/v162\/rajbhandari22a.html"},{"key":"e_1_3_3_1_17_2","series-title":"(SC \u201920)","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. ZeRO: memory optimizations toward training trillion parameter models. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC \u201920). IEEE Press, Article 20, 16\u00a0pages."},{"key":"e_1_3_3_1_18_2","unstructured":"Noam Shazeer. 2019. Fast Transformer Decoding: One Write-Head is All You Need. arxiv:https:\/\/arXiv.org\/abs\/1911.02150\u00a0[cs.NE] https:\/\/arxiv.org\/abs\/1911.02150"},{"key":"e_1_3_3_1_19_2","unstructured":"Mohammad Shoeybi Mostofa Patwary Raul Puri Patrick LeGresley Jared Casper and Bryan Catanzaro. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. CoRR abs\/1909.08053 (2019). arXiv:https:\/\/arXiv.org\/abs\/1909.08053http:\/\/arxiv.org\/abs\/1909.08053"}],"event":{"name":"SCA\/HPCAsia 2026: Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region","location":"Osaka Japan","acronym":"SCA\/HPCAsia 2026"},"container-title":["Proceedings of the Supercomputing Asia and International Conference on High Performance Computing in Asia Pacific Region"],"original-title":[],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T10:23:22Z","timestamp":1767954202000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3773656.3773667"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,1,25]]},"references-count":18,"alternative-id":["10.1145\/3773656.3773667","10.1145\/3773656"],"URL":"https:\/\/doi.org\/10.1145\/3773656.3773667","relation":{},"subject":[],"published":{"date-parts":[[2026,1,25]]},"assertion":[{"value":"2026-01-25","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}