{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,15]],"date-time":"2026-01-15T23:19:41Z","timestamp":1768519181208,"version":"3.49.0"},"reference-count":50,"publisher":"IEEE","license":[{"start":{"date-parts":[[2025,12,14]],"date-time":"2025-12-14T00:00:00Z","timestamp":1765670400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2025,12,14]],"date-time":"2025-12-14T00:00:00Z","timestamp":1765670400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["2312785"],"award-info":[{"award-number":["2312785"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025,12,14]]},"DOI":"10.1109\/icpads67057.2025.11323035","type":"proceedings-article","created":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T20:36:54Z","timestamp":1768423014000},"page":"1-10","source":"Crossref","is-referenced-by-count":0,"title":["FEDeS: Fair, Efficient, and Reliable Multi-Tenant Deep Learning Training with Serverless Computing"],"prefix":"10.1109","author":[{"given":"Yeonhyeok","family":"Jeong","sequence":"first","affiliation":[{"name":"UNIST"}]},{"given":"Seungmin","family":"Lee","sequence":"additional","affiliation":[{"name":"UNIST"}]},{"given":"Seonghyeon","family":"Jue","sequence":"additional","affiliation":[{"name":"UNIST"}]},{"given":"Sam H.","family":"Noh","sequence":"additional","affiliation":[{"name":"Virginia Tech"}]},{"given":"Young-ri","family":"Choi","sequence":"additional","affiliation":[{"name":"UNIST"}]}],"member":"263","reference":[{"key":"ref4","article-title":"Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider","author":"Shahrad","journal-title":"ATC 20"},{"key":"ref5","article-title":"Analysis of Large-Scale Multi-Tenant GPU clusters for DNN training workloads","author":"Jeon","journal-title":"ATC 19"},{"key":"ref6","article-title":"Themis: Fair and efficient GPU cluster scheduling","author":"Mahajan","journal-title":"NSDI 20"},{"key":"ref7","article-title":"Tiresias: A GPU cluster manager for distributed deep learning","author":"Gu","journal-title":"NSDI 19"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/3357223.3362711"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737391"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/90.879343"},{"key":"ref11","article-title":"Generalized $\\alpha$-fair resource allocation in wireless networks","author":"Altman","journal-title":"CDC 08"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"ref14","article-title":"Horovod: fast and easy distributed deep learning in TensorFlow","author":"Sergeev","year":"2018","journal-title":"arXiv preprint arXiv"},{"key":"ref15","article-title":"Heterogeneity-Aware cluster scheduling policies for deep learning workloads","author":"Narayanan","journal-title":"OSDI 20"},{"key":"ref16","article-title":"Elastic resource sharing for distributed deep learning","author":"Hwang","journal-title":"NSDI 21"},{"key":"ref17","article-title":"Pollux: Co-adaptive cluster scheduling for goodput-optimized deep learning","author":"Qiao","journal-title":"OSDI 21"},{"key":"ref18","article-title":"Virtualflow: Decoupling deep learning models from the underlying hardware","author":"Or","journal-title":"MLSys 22"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1145\/3581784.3607054"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575721"},{"key":"ref21","article-title":"sonic: Application-aware data passing for chained serverless applications","author":"Mahgoub","journal-title":"ATC 21"},{"key":"ref22","article-title":"SAND: Towards High-Performance serverless computing","author":"Akkus","journal-title":"ATC 18"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/79173.79181"},{"key":"ref24","article-title":"GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism","author":"Huang","journal-title":"NeurIPS 19"},{"key":"ref25","volume-title":"Megatron-lm: Training billion parameter language models using model parallelism","author":"Shoeybi","year":"2019"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOMW.2018.8406869"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/INFCOM.2004.1354479"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-24777-7_11"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1145\/509593.509647"},{"key":"ref30","article-title":"CheckFreq: Frequent, Fine-Grained DNN checkpointing","author":"Mohan","journal-title":"FAST 21"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613145"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.5555\/3026877.3026899"},{"key":"ref33","article-title":"Pytorch: An imperative style, high-performance deep learning library","author":"Paszke","journal-title":"NeurIPS 19"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref35","article-title":"Learning multiple layers of features from tiny images","author":"Krizhevsky","year":"2009","journal-title":"Tech. Rep."},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref38","article-title":"Very deep convolutional networks for large-scale image recognition","author":"Simonyan","journal-title":"ICLR 15"},{"key":"ref39","article-title":"Google\u2019s neural machine translation system: Bridging the gap between human and machine translation","author":"Wu","journal-title":"CoRR 16"},{"key":"ref41","article-title":"Language models are unsupervised multitask learners","author":"Radford","year":"2019","journal-title":"OpenAI Blog"},{"key":"ref42","article-title":"Pointer sentinel mixture models","author":"Merity","journal-title":"ICLR 17"},{"key":"ref43","article-title":"BERT: Pretraining of deep bidirectional transformers for language understanding","author":"Devlin","journal-title":"NAACL HLT 19"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356137"},{"key":"ref48","article-title":"Accurate, large minibatch SGD: training imagenet in 1 hour","author":"Goyal","journal-title":"CoRR 17"},{"key":"ref49","article-title":"Adabatch: Adaptive batch sizes for training deep neural networks","author":"Devarakonda","journal-title":"CoRR 17"},{"key":"ref50","article-title":"Large batch optimization for deep learning: Training bert in 76 minutes","author":"You","journal-title":"ICLR 20"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613152"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/3694715.3695960"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3459240"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1145\/3485510"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/2886107.2886110"},{"key":"ref57","article-title":"Pacman: Coordinated memory caching for parallel jobs","author":"Ananthanarayanan","journal-title":"OSDI 12"}],"event":{"name":"2025 IEEE 31th International Conference on Parallel and Distributed Systems (ICPADS)","location":"Hefei, China","start":{"date-parts":[[2025,12,14]]},"end":{"date-parts":[[2025,12,18]]}},"container-title":["2025 IEEE 31th International Conference on Parallel and Distributed Systems (ICPADS)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/11322805\/11322871\/11323035.pdf?arnumber=11323035","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,15]],"date-time":"2026-01-15T07:28:18Z","timestamp":1768462098000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11323035\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,14]]},"references-count":50,"URL":"https:\/\/doi.org\/10.1109\/icpads67057.2025.11323035","relation":{},"subject":[],"published":{"date-parts":[[2025,12,14]]}}}