{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,30]],"date-time":"2025-08-30T05:40:11Z","timestamp":1756532411775,"version":"3.44.0"},"reference-count":40,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,8,4]],"date-time":"2025-08-04T00:00:00Z","timestamp":1754265600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,4]],"date-time":"2025-08-04T00:00:00Z","timestamp":1754265600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,8,4]]},"DOI":"10.1109\/icccn65249.2025.11133974","type":"proceedings-article","created":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T17:39:20Z","timestamp":1756489160000},"page":"1-6","source":"Crossref","is-referenced-by-count":0,"title":["TOPO-X: Co-optimize Flow Scheduling, Topology, and ML Training Parallelism"],"prefix":"10.1109","author":[{"given":"Yi-Xiang","family":"Hu","sequence":"first","affiliation":[{"name":"University of Science and Technology of China,School of Computer Science and Technology,Hefei,China"}]},{"given":"Han","family":"Tian","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Computer Science and Technology,Hefei,China"}]},{"given":"Yifang","family":"Zhao","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Cyber Science and Technology,Hefei,China"}]},{"given":"Feng","family":"Wu","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Computer Science and Technology,Hefei,China"}]},{"given":"Xiang-Yang","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China,School of Computer Science and Technology,Hefei,China"}]}],"member":"263","reference":[{"article-title":"Chatgpt: Transforming text generation with deep learning","volume-title":"OpenAI Blog","year":"2022","key":"ref1"},{"article-title":"Improving language understanding by generative pre-training","year":"2018","author":"Radford","key":"ref2"},{"article-title":"Language models are unsupervised multitask learners","year":"2019","author":"Radford","key":"ref3"},{"key":"ref4","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in neural information processing systems"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/1402946.1402967"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851192"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/2534169.2486031"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/1879141.1879175"},{"key":"ref9","first-page":"249","article-title":"Network requirements for resource disaggregation","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Gao"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2019.2947013"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.48550\/arxiv.1811.06965"},{"key":"ref12","article-title":"Beyond data and model parallelism for deep neural networks","volume":"abs\/1807.05358","author":"Jia","year":"2018"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533727"},{"key":"ref14","first-page":"739","article-title":"TopoOpt: Co-optimizing network topology and parallelization strategy for distributed training jobs","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Wang"},{"key":"ref15","article-title":"Accelerating persistent neural networks at datacenter scale","volume":"29","author":"Chung","year":"2017","journal-title":"Hot Chips"},{"article-title":"Horovod: fast and easy distributed deep learning in tensorflow","year":"2018","author":"Sergeev","key":"ref16"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/3419394.3423637"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3620666.3651335"},{"key":"ref20","first-page":"2430","article-title":"Device placement optimization with reinforcement learning","volume-title":"Proceedings of the 34th International Conference on Machine Learning","volume":"70","author":"Mirhoseini"},{"key":"ref21","first-page":"1403","article-title":"CASSINI: Network-Aware job scheduling in machine learning clusters","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI 24)","author":"Rajasekaran"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"year":"2022","key":"ref23","article-title":"Deep learning recommendation model for personalization and recommendation systems"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851223"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934911"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"ref27","article-title":"Better deep learning: train faster, reduce overfitting, and make better predictions","author":"Brownlee","year":"2018","journal-title":"Machine learning mastery"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2003.815843"},{"key":"ref29","first-page":"235","article-title":"Understanding lifecycle management complexity of datacenter topologies","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Zhang"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TITS.2023.3308594"},{"year":"2022","key":"ref31","article-title":"Polatis optical circuit switch"},{"key":"ref32","first-page":"2274","article-title":"Exploring hidden dimensions in accelerating convolutional neural networks","volume-title":"International Conference on Machine Learning","author":"Jia"},{"key":"ref33","article-title":"On model parallelization and scheduling strategies for distributed machine learning","volume":"27","author":"Lee","year":"2014","journal-title":"Advances in neural information processing systems"},{"article-title":"Improving language understanding by generative pre-training","year":"2018","author":"Radford","key":"ref34"},{"year":"2024","key":"ref35","article-title":"Gurobi Optimizer Reference Manual"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589350"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507726"},{"article-title":"Mxnet: A flexible and efficient machine learning library for heterogeneous distributed systems","year":"2015","author":"Chen","key":"ref38"},{"key":"ref39","first-page":"289","article-title":"Themis: Fair and efficient GPU cluster scheduling","volume-title":"17th USENIX Symposium on Networked Systems Design and Implementation (NSDI 20)","author":"Mahajan"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1145\/3651890.3672249"}],"event":{"name":"2025 34th International Conference on Computer Communications and Networks (ICCCN)","start":{"date-parts":[[2025,8,4]]},"location":"Tokyo, Japan","end":{"date-parts":[[2025,8,7]]}},"container-title":["2025 34th International Conference on Computer Communications and Networks (ICCCN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11133715\/11133717\/11133974.pdf?arnumber=11133974","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,30]],"date-time":"2025-08-30T05:14:28Z","timestamp":1756530868000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11133974\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,4]]},"references-count":40,"URL":"https:\/\/doi.org\/10.1109\/icccn65249.2025.11133974","relation":{},"subject":[],"published":{"date-parts":[[2025,8,4]]}}}