{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T01:20:43Z","timestamp":1773451243192,"version":"3.50.1"},"reference-count":49,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"4","license":[{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,10,1]],"date-time":"2023-10-01T00:00:00Z","timestamp":1696118400000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62172316"],"award-info":[{"award-number":["62172316"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Soft Science Research Plans of Shaanxi Province","award":["2020KRZ018"],"award-info":[{"award-number":["2020KRZ018"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Cloud Comput."],"published-print":{"date-parts":[[2023,10]]},"DOI":"10.1109\/tcc.2023.3308161","type":"journal-article","created":{"date-parts":[[2023,8,24]],"date-time":"2023-08-24T17:25:57Z","timestamp":1692897957000},"page":"3631-3642","source":"Crossref","is-referenced-by-count":8,"title":["On a Meta Learning-Based Scheduler for Deep Learning Clusters"],"prefix":"10.1109","volume":"11","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3096-7741","authenticated-orcid":false,"given":"Jin","family":"Yang","sequence":"first","affiliation":[{"name":"Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1710-4335","authenticated-orcid":false,"given":"Liang","family":"Bao","sequence":"additional","affiliation":[{"name":"Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8207-1862","authenticated-orcid":false,"given":"Wenjing","family":"Liu","sequence":"additional","affiliation":[{"name":"Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3769-7630","authenticated-orcid":false,"given":"Rong","family":"Yang","sequence":"additional","affiliation":[{"name":"Xidian University, Xi&#x2019;an, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8218-1209","authenticated-orcid":false,"given":"Chase Q.","family":"Wu","sequence":"additional","affiliation":[{"name":"New Jersey Institute of Technology, Newark, NJ, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3457913.3457936"},{"issue":"1","key":"ref2","first-page":"88","article-title":"Horus: Interference-aware and prediction-based scheduling in deep learning systems","volume-title":"IEEE Trans. Parallel Distrib. Syst.","volume":"33","author":"Yeung","year":"2022"},{"issue":"8","key":"ref3","first-page":"1947","article-title":"DL2: A deep learning-driven scheduler for deep learning clusters","volume-title":"IEEE Trans. Parallel Distrib. Syst.","volume":"32","author":"Peng","year":"2021"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"ref5","first-page":"595","article-title":"Gandiva: Introspective cluster scheduling for deep learning","volume-title":"Proc. 13th USENIX Symp. Operating Syst. Des. Implementation","author":"Xiao"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476223"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1145\/3326285.3329065"},{"issue":"9","key":"ref8","first-page":"2419","article-title":"Challenges of real-world reinforcement learning: Definitions, benchmarks and analysis","volume-title":"Mach. Learn.","volume":"110","author":"Dulac-Arnold","year":"2021"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.73"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW.2017.70"},{"key":"ref11","doi-asserted-by":"crossref","DOI":"10.1631\/FITEE.2200297","article-title":"On the principles of parsimony and self-consistency for the emergence of intelligence","author":"Ma","year":"2022"},{"key":"ref12","article-title":"RL2: Fast reinforcement learning via slow reinforcement learning","author":"Duan","year":"2016"},{"key":"ref13","first-page":"1126","article-title":"Model-agnostic meta-learning for fast adaptation of deep networks","volume-title":"Proc. 34th Int. Conf. Mach. Learn.","author":"Finn"},{"key":"ref14","article-title":"Meta-learning curiosity algorithms","author":"Alet","year":"2020"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387555"},{"issue":"5","key":"ref16","first-page":"1128","article-title":"Elastic parameter server: Accelerating ML training with scalable resource scheduling","volume-title":"IEEE Trans. Parallel Distrib. Syst.","volume":"33","author":"Wang","year":"2021"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/INSAI54028.2021.00041"},{"key":"ref18","first-page":"721","article-title":"Elastic resource sharing for distributed deep learning","author":"Hwang","year":"2021","journal-title":"Proc. 18th USENIX Sympos. Netw. Syst. Des. Implementation"},{"key":"ref19","first-page":"533","article-title":"AntMan: Dynamic Scaling on GPU Clusters for Deep Learning","author":"Xiao","year":"2020","journal-title":"Proc. 14th USENIX Sympos. Oper. Syst. Des. Implementation"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/tcc.2022.3222649"},{"issue":"1","key":"ref21","first-page":"34","article-title":"Deep learning research and development platform: Characterizing and scheduling with QoS guarantees on GPU clusters","volume-title":"IEEE Trans. Parallel Distrib. Syst.","volume":"31","author":"Chen","year":"2020"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00094"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.5220\/0007707605690577"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737460"},{"issue":"4","key":"ref25","first-page":"948","article-title":"Online placement and scaling of geo-distributed machine learning jobs via volume-discounting brokerage","volume-title":"IEEE Trans. Parallel Distrib. Syst.","volume":"31","author":"Li","year":"2020"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/JCC53141.2021.00013"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM41043.2020.9155445"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486978"},{"key":"ref29","doi-asserted-by":"crossref","DOI":"10.1109\/MASCOTS50786.2020.9285954","article-title":"Effective elastic scaling of deep learning workloads","author":"Saxena","year":"2020"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337873"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-22698-4_4"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/IPCCC55026.2022.9894315"},{"key":"ref33","first-page":"153432","article-title":"SCARL: Attentive reinforcement learning-based scheduling in a multi-resource heterogeneous cluster","volume-title":"IEEE Access","volume":"7","author":"Cheong","year":"2019"},{"key":"ref34","first-page":"101390","article-title":"Deep reinforcement learning for minimizing tardiness in parallel machine scheduling with sequence dependent family setups","volume-title":"IEEE Access","volume":"9","author":"Paeng","year":"2021"},{"key":"ref35","article-title":"Learning to schedule multi-NUMA virtual machines via reinforcement learning","volume-title":"Pattern Recognit.","volume":"121","author":"Sheng","year":"2022"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-86380-7_50"},{"key":"ref37","doi-asserted-by":"crossref","DOI":"10.1109\/TPDS.2023.3313779","article-title":"Task placement and resource allocation for edge machine learning: A GNN-based multi-agent reinforcement learning paradigm","author":"Li","year":"2023"},{"issue":"1","key":"ref38","first-page":"58","article-title":"Machine learning feature based job scheduling for distributed machine learning clusters","volume-title":"IEEE\/ACM Trans. Netw.","volume":"31","author":"Wang","year":"2023"},{"key":"ref39","first-page":"83","article-title":"Towards topology aware pre-emptive job scheduling with deep reinforcement learning","volume-title":"Proc. 30th Annu. Int. Conf. Comput. Sci. Softw. Eng.","author":"Ryu"},{"key":"ref40","doi-asserted-by":"crossref","DOI":"10.1145\/3458817.3476209","article-title":"Efficient large-scale language model training on GPU clusters using megatron-LM","author":"Narayanan","year":"2021"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1145\/3377454"},{"key":"ref42","article-title":"Sparse allreduce: Efficient scalable communication for power-law data","author":"Zhao","year":"2013"},{"key":"ref43","article-title":"Playing Atari with deep reinforcement learning","author":"Mnih","year":"2013"},{"key":"ref44","article-title":"Proximal policy optimization algorithms","author":"Schulman","year":"2017"},{"key":"ref45","article-title":"Challenges of real-world reinforcement learning","author":"Dulac-Arnold"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2018.8486422"},{"key":"ref47","first-page":"947","article-title":"Analysis of large-scale multi-tenant GPU clusters for DNN training workloads","volume-title":"Proc. USENIX Annu. Tech. Conf.","author":"Jeon"},{"key":"ref48","first-page":"289","article-title":"THEMIS: Fair and eficient GPU cluster scheduling","volume-title":"Proc. 17th USENIX Sympos. Netw. Syst. Des. Implementation","author":"Mahajan","year":"2020"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/2523616.2523633"}],"container-title":["IEEE Transactions on Cloud Computing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6245519\/10345387\/10229506.pdf?arnumber=10229506","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,12,19]],"date-time":"2023-12-19T23:30:27Z","timestamp":1703028627000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10229506\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10]]},"references-count":49,"journal-issue":{"issue":"4"},"URL":"https:\/\/doi.org\/10.1109\/tcc.2023.3308161","relation":{},"ISSN":["2168-7161","2372-0018"],"issn-type":[{"value":"2168-7161","type":"electronic"},{"value":"2372-0018","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,10]]}}}