{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T07:03:01Z","timestamp":1763190181062,"version":"3.45.0"},"publisher-location":"Cham","reference-count":16,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783032104656","type":"print"},{"value":"9783032104663","type":"electronic"}],"license":[{"start":{"date-parts":[[2025,11,16]],"date-time":"2025-11-16T00:00:00Z","timestamp":1763251200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,11,16]],"date-time":"2025-11-16T00:00:00Z","timestamp":1763251200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2026]]},"DOI":"10.1007\/978-3-032-10466-3_22","type":"book-chapter","created":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T06:58:57Z","timestamp":1763189937000},"page":"266-277","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["HBD-CE: Efficient Cross-HBD Communication for\u00a0LLM Training in\u00a0High-Bandwidth Domain Cluster via\u00a0Hierarchical Collectives"],"prefix":"10.1007","author":[{"given":"Huihuang","family":"Qin","sequence":"first","affiliation":[]},{"given":"Shuangwu","family":"Chen","sequence":"additional","affiliation":[]},{"given":"Zijian","family":"Wen","sequence":"additional","affiliation":[]},{"given":"Zian","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Ziyang","family":"Zou","sequence":"additional","affiliation":[]},{"given":"Tao","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Xiaobin","family":"Tan","sequence":"additional","affiliation":[]},{"given":"Jian","family":"Yang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,11,16]]},"reference":[{"key":"22_CR1","doi-asserted-by":"crossref","unstructured":"Cho, M., Finkler, U., Kung, D., Hunter, H.: BlueConnect: decomposing all-reduce for deep learning on heterogeneous network hierarchy. In: Proceedings of Machine Learning and Systems, vol. 1, pp. 241\u2013251 (2019)","DOI":"10.1147\/JRD.2019.2947013"},{"key":"22_CR2","unstructured":"Feng, Y., et al.: RailX: a flexible, scalable, and low-cost network architecture for hyper-scale LLM training systems. arXiv preprint arXiv:2507.18889 (2025)"},{"key":"22_CR3","unstructured":"Jiang, Z., et\u00a0al.: $$\\{$$MegaScale$$\\}$$: scaling large language model training to more than 10,000 $$\\{$$GPUs$$\\}$$. In: 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24), pp. 745\u2013760 (2024)"},{"key":"22_CR4","doi-asserted-by":"crossref","unstructured":"Khorassani, K.S., Chu, C.H., Anthony, Q.G., Subramoni, H., Panda, D.K.: Adaptive and hierarchical large message all-to-all communication algorithms for large-scale dense GPU systems. In: 2021 IEEE\/ACM 21st International Symposium on Cluster, Cloud and Internet Computing (CCGrid), pp. 113\u2013122. IEEE (2021)","DOI":"10.1109\/CCGrid51090.2021.00021"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Lee, J., Hwang, I., Shah, S., Cho, M.: FlexReduce: flexible all-reduce for distributed deep learning on asymmetric network topology. In: 2020 57th ACM\/IEEE Design Automation Conference (DAC), pp.\u00a01\u20136. IEEE (2020)","DOI":"10.1109\/DAC18072.2020.9218538"},{"key":"22_CR6","unstructured":"Mikami, H., Suganuma, H., Tanaka, Y., Kageyama, Y., et\u00a0al.: Massively distributed SGD: ImageNet\/ResNet-50 training in a flash. arXiv preprint arXiv:1811.05233 (2018)"},{"key":"22_CR7","doi-asserted-by":"crossref","unstructured":"Narayanan, D., Shoeybi, M., Casper, J., et\u00a0al.: Efficient large-scale language model training on GPU clusters using megatron-LM. In: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 1\u201315 (2021)","DOI":"10.1145\/3458817.3476209"},{"key":"22_CR8","unstructured":"NVIDIA: Nvidia DGX superpod (2024). https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-superpod"},{"key":"22_CR9","doi-asserted-by":"crossref","unstructured":"Pan, X., Lin, W., Shi, S., Chu, X., Sun, W., Li, B.: Parm: efficient training of large sparsely-activated models with dedicated schedules. In: IEEE INFOCOM 2024-IEEE Conference on Computer Communications, pp. 1880\u20131889. IEEE (2024)","DOI":"10.1109\/INFOCOM52122.2024.10621327"},{"key":"22_CR10","unstructured":"Rajbhandari, S., et al.: DeepSpeed-MoE: advancing mixture-of-experts inference and training to power next-generation AI scale. In: International Conference on Machine Learning, pp. 18332\u201318346. PMLR (2022)"},{"key":"22_CR11","doi-asserted-by":"crossref","unstructured":"Shou, C., et\u00a0al.: InfiniteHBD: building datacenter-scale high-bandwidth domain for LLM with optical circuit switching transceivers. arXiv preprint arXiv:2502.03885 (2025)","DOI":"10.1145\/3718958.3750468"},{"key":"22_CR12","unstructured":"Smith, S., et\u00a0al.: Using deepspeed and megatron to train megatron-turing NLG 530b, a large-scale generative language model. arXiv preprint arXiv:2201.11990 (2022)"},{"key":"22_CR13","unstructured":"Um, T., et al.: Metis: fast automatic distributed training on heterogeneous $$\\{$$GPUs$$\\}$$. In: 2024 USENIX Annual Technical Conference (USENIX ATC 24), pp. 563\u2013578 (2024)"},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"Wang, W., Ghobadi, M., Shakeri, K., Zhang, Y., Hasani, N.: Rail-only: a low-cost high-performance network for training LLMs with trillion parameters. In: 2024 IEEE Symposium on High-Performance Interconnects (HOTI), pp. 1\u201310. IEEE (2024)","DOI":"10.1109\/HOTI63208.2024.00013"},{"key":"22_CR15","unstructured":"Wang, X., Li, Q., et\u00a0al.: $$\\{$$SimAI$$\\}$$: unifying architecture design and performance tuning for $$\\{$$Large-Scale$$\\}$$ large language model training with scalability and precision. In: 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25), pp. 541\u2013558 (2025)"},{"key":"22_CR16","unstructured":"Zuo, P., et\u00a0al.: Serving large language models on Huawei cloudmatrix384. arXiv preprint arXiv:2506.12708 (2025)"}],"container-title":["Lecture Notes in Computer Science","Network and Parallel Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-032-10466-3_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,11,15]],"date-time":"2025-11-15T06:59:02Z","timestamp":1763189942000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-032-10466-3_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,16]]},"ISBN":["9783032104656","9783032104663"],"references-count":16,"URL":"https:\/\/doi.org\/10.1007\/978-3-032-10466-3_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,11,16]]},"assertion":[{"value":"16 November 2025","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NPC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"IFIP International Conference on Network and Parallel Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Nha Trang","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Vietnam","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2025","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 November 2025","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 November 2025","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"21","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"npc2025","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.npc-conference.com\/#\/npc2025","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}