{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,30]],"date-time":"2025-08-30T05:40:10Z","timestamp":1756532410611,"version":"3.44.0"},"reference-count":38,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,8,4]],"date-time":"2025-08-04T00:00:00Z","timestamp":1754265600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,8,4]],"date-time":"2025-08-04T00:00:00Z","timestamp":1754265600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,8,4]]},"DOI":"10.1109\/icccn65249.2025.11133842","type":"proceedings-article","created":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T17:39:20Z","timestamp":1756489160000},"page":"1-9","source":"Crossref","is-referenced-by-count":0,"title":["Revisiting the Straggling Problem in GPU-based Distributed Deep Learning Training"],"prefix":"10.1109","author":[{"given":"Suraiya","family":"Tairin","sequence":"first","affiliation":[{"name":"University of Virginia,Department of Computer Science,Charlottesville,VA,22904"}]},{"given":"Zeyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Virginia,Department of Computer Science,Charlottesville,VA,22904"}]},{"given":"Haiying","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Virginia,Department of Computer Science,Charlottesville,VA,22904"}]}],"member":"263","reference":[{"article-title":"Flexgen: High-throughput generative inference of large language models with a single gpu","volume-title":"Proc. of ICML","author":"Sheng","key":"ref1"},{"key":"ref2","article-title":"Openai\u2019s ceo says the age of giant ai models is already over"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.5555\/3026877.3026899"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421299"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1145\/3419111.3421307"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS51616.2021.00057"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00028"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/2987550.2987554"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378499"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3452773"},{"key":"ref11","first-page":"1145","article-title":"Asynchronous byzantine machine learning (the case of sgd)","volume-title":"Proc. of ICML","author":"Damaskinos"},{"article-title":"Effective straggler mitigation: Attack of the clones","volume-title":"Proc. of NSDI 13","author":"Ananthanarayanan","key":"ref12"},{"article-title":"Revisiting distributed synchronous sgd","year":"2016","author":"Chen","key":"ref13"},{"key":"ref14","first-page":"400","article-title":"Resource elasticity in distributed deep learning","volume-title":"Proc. of MLSys","volume":"2","author":"Or"},{"article-title":"Tictac: Accelerating distributed deep learning with communication scheduling","volume-title":"Proc. of MLSys","author":"Hashemi","key":"ref15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-36071-8_8"},{"article-title":"Heterogeneity-aware cluster scheduling policies for deep learning workloads","volume-title":"Proc. of OSDI","author":"Narayanan","key":"ref17"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488815"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304009"},{"article-title":"MLaaS in the wild: Workload analysis and scheduling in large-scale heterogeneous GPU clusters","volume-title":"Proc. of NSDI","author":"Weng","key":"ref20"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOMWKSHPS57453.2023.10226109"},{"key":"ref22","article-title":"Microsoft philly trace"},{"key":"ref23","article-title":"Bbc text categorization"},{"key":"ref24","article-title":"Wikitext dataset"},{"article-title":"Horovod: fast and easy distributed deep learning in tensorflow","year":"2018","author":"Sergeev","key":"ref25"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1145\/3127479.3127490"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/326619.326694"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/B978-1-55860-335-6.50043-X"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/2740070.2626334"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v36i8.20832"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613152"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.14778\/3598581.3598604"},{"article-title":"Large batch size training of neural networks with adversarial training and second-order information","year":"2018","author":"Yao","key":"ref34"},{"article-title":"Bamboo: Making preemptible instances resilient for affordable training of large {DNNs}","volume-title":"Proc. of NSDI","author":"Thorpe","key":"ref35"},{"article-title":"Zero++: Extremely efficient collective communication for giant model training","year":"2023","author":"Wang","key":"ref36"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2021.3131614"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1016\/j.ins.2021.06.039"}],"event":{"name":"2025 34th International Conference on Computer Communications and Networks (ICCCN)","start":{"date-parts":[[2025,8,4]]},"location":"Tokyo, Japan","end":{"date-parts":[[2025,8,7]]}},"container-title":["2025 34th International Conference on Computer Communications and Networks (ICCCN)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11133715\/11133717\/11133842.pdf?arnumber=11133842","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,30]],"date-time":"2025-08-30T05:14:23Z","timestamp":1756530863000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11133842\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,8,4]]},"references-count":38,"URL":"https:\/\/doi.org\/10.1109\/icccn65249.2025.11133842","relation":{},"subject":[],"published":{"date-parts":[[2025,8,4]]}}}