{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T01:13:25Z","timestamp":1778721205975,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","funder":[{"name":"Zhejiang Provincial Natural Science Foundation of China","award":["LQN26F020008"],"award-info":[{"award-number":["LQN26F020008"]}]},{"name":"CAAI-Ant Group Research Fund","award":["2025CAAI-ANT-15"],"award-info":[{"award-number":["2025CAAI-ANT-15"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,4,13]]},"DOI":"10.1145\/3774904.3792254","type":"proceedings-article","created":{"date-parts":[[2026,4,27]],"date-time":"2026-04-27T13:28:36Z","timestamp":1777296516000},"page":"5189-5197","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["HeteroSim: Towards High-Fidelity Heterogeneous LLM Training Simulation on GPUs"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-3106-586X","authenticated-orcid":false,"given":"Xiaofei","family":"Yue","sequence":"first","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, China and Beijing Institute of Technology, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6782-4444","authenticated-orcid":false,"given":"Fangming","family":"Zhao","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5363-3163","authenticated-orcid":false,"given":"Fulun","family":"Ye","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2888-4499","authenticated-orcid":false,"given":"Jiongchi","family":"Yu","sequence":"additional","affiliation":[{"name":"Singapore Management University, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2195-0799","authenticated-orcid":false,"given":"Zhaoxuan","family":"Li","sequence":"additional","affiliation":[{"name":"Institute of Information Engineering, CAS, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6589-3706","authenticated-orcid":false,"given":"Tingting","family":"Li","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1455-4330","authenticated-orcid":false,"given":"Ziming","family":"Zhao","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4703-7348","authenticated-orcid":false,"given":"Jianwei","family":"Yin","sequence":"additional","affiliation":[{"name":"School of Software Technology, Zhejiang University, Ningbo, China"}]}],"member":"320","published-online":{"date-parts":[[2026,4,12]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Qsgd: Communication-efficient sgd via gradient quantization and encoding. Advances in neural information processing systems, 30","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. Qsgd: Communication-efficient sgd via gradient quantization and encoding. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851192"},{"key":"e_1_3_2_1_3_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877-1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877-1901, 2020."},{"key":"e_1_3_2_1_4_1","first-page":"578","volume-title":"OSDI","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, et al. TVM: An automated End-to-End optimizing compiler for deep learning. In OSDI, pages 578-594, 2018."},{"key":"e_1_3_2_1_5_1","first-page":"241","article-title":"Decomposing all-reduce for deep learning on heterogeneous network hierarchy","volume":"1","author":"Cho Minsik","year":"2019","unstructured":"Minsik Cho, Ulrich Finkler, David Kung, and Hillery Hunter. Blueconnect: Decomposing all-reduce for deep learning on heterogeneous network hierarchy. Proceedings of Machine Learning and Systems, 1:241-251, 2019.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_6_1","first-page":"225","volume-title":"ACM SIGCOMM","author":"Ghorbani Soudeh","year":"2017","unstructured":"Soudeh Ghorbani, Zibin Yang, P Brighten Godfrey, Yashar Ganjali, and Amin Firoozshahian. Drill: Micro load balancing for low-latency data center networks. In ACM SIGCOMM, pages 225-238, 2017."},{"key":"e_1_3_2_1_7_1","first-page":"51","volume-title":"ACM SIGCOMM","author":"Greenberg Albert","year":"2009","unstructured":"Albert Greenberg, James R Hamilton, et al. Vl2: A scalable and flexible data center network. In ACM SIGCOMM, pages 51-62, 2009."},{"key":"e_1_3_2_1_8_1","first-page":"485","volume-title":"USENIX NSDI","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, et al. Tiresias: A GPU cluster manager for distributed deep learning. In USENIX NSDI, pages 485-500, 2019."},{"key":"e_1_3_2_1_9_1","first-page":"473","volume-title":"USENIX NSDI","author":"Gui Fei","year":"2025","unstructured":"Fei Gui, Kaihui Gao, Li Chen, et al. Accelerating design space exploration for training systems with multi-experiment parallel simulation. In USENIX NSDI, pages 473-488, 2025."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787507"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-8191(06)80021-9"},{"key":"e_1_3_2_1_12_1","first-page":"1","article-title":"Beyond data and model parallelism for deep neural networks","volume":"1","author":"Zhihao Jia","year":"2019","unstructured":"Zhihao Jia et al. Beyond data and model parallelism for deep neural networks. Proceedings of Machine Learning and Systems, 1:1-13, 2019.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_2_1_15_1","first-page":"44","volume-title":"ACM SIGCOMM","author":"Li Yuliang","year":"2019","unstructured":"Yuliang Li, Rui Miao, Hongqiang Harry Liu, Yan Zhuang, Fei Feng, et al. Hpcc: High precision congestion control. In ACM SIGCOMM, pages 44-58. 2019."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3389705"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787510"},{"key":"e_1_3_2_1_18_1","first-page":"1","volume-title":"ACM SOSP","author":"Narayanan Deepak","year":"2019","unstructured":"Deepak Narayanan, Aaron Harlap, Amar Phanishayee, Vivek Seshadri, et al. Pipedream: Generalized pipeline parallelism for dnn training. In ACM SOSP, pages 1-15, 2019."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_20_1","volume-title":"Doubling all2all performance with nvidia collective communication library 2.12. https:\/\/developer.nvidia.com\/blog\/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12\/","author":"NVIDIA.","year":"2022","unstructured":"NVIDIA. Doubling all2all performance with nvidia collective communication library 2.12. https:\/\/developer.nvidia.com\/blog\/doubling-all2all-performance-with-nvidia-collective-communication-library-2-12\/, 2022."},{"key":"e_1_3_2_1_21_1","volume-title":"NVIDIA Collective Communication Library (NCCL) Documentation","author":"NVIDIA.","year":"2025","unstructured":"NVIDIA. NVIDIA Collective Communication Library (NCCL) Documentation, 2025. https:\/\/docs.nvidia.com\/deeplearning\/nccl."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2014.2299539"},{"key":"e_1_3_2_1_23_1","first-page":"95","volume-title":"Peer-to-Peer Computing","author":"Quinson Martin","year":"2009","unstructured":"Martin Quinson. Simgrid: a generic framework for large-scale distributed experiments. In Peer-to-Peer Computing, pages 95-96, 2009."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_25_1","first-page":"81","volume-title":"IEEE International Symposium on Performance Analysis of Systems and Software","author":"Saeed","year":"2020","unstructured":"Saeed Rashidi et al. Astra-sim: Enabling sw\/hw co-design exploration for distributed dl training platforms. In IEEE International Symposium on Performance Analysis of Systems and Software, pages 81-92, 2020."},{"key":"e_1_3_2_1_26_1","first-page":"225","volume-title":"NSDI","author":"Ankit","year":"2012","unstructured":"Ankit Singla et al. Jellyfish: Networking data centers randomly. In NSDI, pages 225-238. USENIX Association, 2012."},{"key":"e_1_3_2_1_27_1","first-page":"10118","volume-title":"International Conference on Machine Learning","author":"Tang Hanlin","year":"2021","unstructured":"Hanlin Tang, Shaoduo Gan, et al. 1-bit adam: Communication efficient large-scale training with adam's convergence speed. In International Conference on Machine Learning, pages 10118-10129, 2021."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_2_1_29_1","volume-title":"Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971","author":"Hugo Touvron","year":"2023","unstructured":"Hugo Touvron et al. Llama: Open and efficient foundation language models. arXiv preprint arXiv:2302.13971, 2023."},{"key":"e_1_3_2_1_30_1","first-page":"267","volume-title":"USENIX Symposium on Operating Systems Design and Implementation","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, et al. Unity: Accelerating DNN training through joint optimization of algebraic transformations and parallelization. In USENIX Symposium on Operating Systems Design and Implementation, pages 267-284, 2022."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.5555\/1416222.1416290"},{"key":"e_1_3_2_1_32_1","first-page":"541","volume-title":"USENIX NSDI","author":"Wang Xizheng","year":"2025","unstructured":"Xizheng Wang, Qingxu Li, Yichi Xu, et al. SimAI: Unifying architecture design and performance tuning for large-scale large language model training with scalability and precision. In USENIX NSDI, pages 541-558, 2025."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS57527.2023.00035"},{"key":"e_1_3_2_1_35_1","first-page":"595","volume-title":"USENIX OSDI","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, et al. Gandiva: Introspective cluster scheduling for deep learning. In USENIX OSDI, pages 595-610, 2018."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2025.3632089"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM52122.2024.10621303"},{"key":"e_1_3_2_1_38_1","first-page":"515","volume-title":"USENIX OSDI","author":"Zhao Hanyu","year":"2020","unstructured":"Hanyu Zhao, Zhenhua Han, et al. HiveD: Sharing a GPU cluster for deep learning with guarantees. In USENIX OSDI, pages 515-532, 2020."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589334.3645407"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM52122.2024.10621290"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2025.3560045"},{"key":"e_1_3_2_1_42_1","first-page":"559","volume-title":"USENIX OSDI","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, et al. Alpa: Automating inter-and intra-operator parallelism for distributed deep learning. In USENIX OSDI, pages 559-578, 2022."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787484"}],"event":{"name":"WWW '26: The ACM Web Conference 2026","location":"Dubai United Arab Emirates","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web"]},"container-title":["Proceedings of the ACM Web Conference 2026"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3774904.3792254","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,5,14]],"date-time":"2026-05-14T01:02:26Z","timestamp":1778720546000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774904.3792254"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,4,12]]},"references-count":43,"alternative-id":["10.1145\/3774904.3792254","10.1145\/3774904"],"URL":"https:\/\/doi.org\/10.1145\/3774904.3792254","relation":{},"subject":[],"published":{"date-parts":[[2026,4,12]]},"assertion":[{"value":"2026-04-12","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}