{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T01:54:33Z","timestamp":1769824473903,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":19,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,14]],"date-time":"2022-08-14T00:00:00Z","timestamp":1660435200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,8,14]]},"DOI":"10.1145\/3534678.3539127","type":"proceedings-article","created":{"date-parts":[[2022,8,12]],"date-time":"2022-08-12T19:06:41Z","timestamp":1660331201000},"page":"4032-4040","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":4,"title":["NENYA: Cascade Reinforcement Learning for Cost-Aware Failure Mitigation at Microsoft 365"],"prefix":"10.1145","author":[{"given":"Lu","family":"Wang","sequence":"first","affiliation":[{"name":"Microsoft Research, Beijing, UNK, China"}]},{"given":"Pu","family":"Zhao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, UNK, China"}]},{"given":"Chao","family":"Du","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, UNK, China"}]},{"given":"Chuan","family":"Luo","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, UNK, China"}]},{"given":"Mengna","family":"Su","sequence":"additional","affiliation":[{"name":"Microsoft 365, Suzhou, UNK, China"}]},{"given":"Fangkai","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, UNK, China"}]},{"given":"Yudong","family":"Liu","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, UNK, China"}]},{"given":"Qingwei","family":"Lin","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, UNK, China"}]},{"given":"Min","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft 365, Suzhou, China"}]},{"given":"Yingnong","family":"Dang","sequence":"additional","affiliation":[{"name":"Microsoft Azure, Seattle, WA, USA"}]},{"given":"Hongyu","family":"Zhang","sequence":"additional","affiliation":[{"name":"The University of Newcastle, Newcastle, UNK, Australia"}]},{"given":"Saravan","family":"Rajmohan","sequence":"additional","affiliation":[{"name":"Microsoft 365, Seattle, WA, USA"}]},{"given":"Dongmei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, UNK, China"}]}],"member":"320","published-online":{"date-parts":[[2022,8,14]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939699"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/226643.226647"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2002.1004595"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126937"},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of ICML 2018","author":"Haarnoja Tuomas","year":"2018","unstructured":"Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine. 2018. Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor. In Proceedings of ICML 2018. PMLR, 1861--1870."},{"key":"e_1_3_2_1_6_1","volume-title":"Proceedings of ICML","volume":"1","author":"Hamerly Greg","year":"2001","unstructured":"Greg Hamerly, Charles Elkan, et al. 2001. Bayesian approaches to failure prediction for disk drives. In Proceedings of ICML 2001, Vol. 1. Citeseer, 202--209."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2019\/351"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2043556.2043583"},{"key":"e_1_3_2_1_10_1","volume-title":"Proceedings of OSDI","author":"Levy Sebastien","year":"2020","unstructured":"Sebastien Levy, Randolph Yao, YoujiangWu, Yingnong Dang, Peng Huang, Zheng Mu, Pu Zhao, Tarun Ramani, Naga Govindaraju, Xukun Li, et al. 2020. Narya: Predictive and adaptive failure mitigation to avert production cloud vm interruptions. In Proceedings of OSDI 2020."},{"key":"e_1_3_2_1_11_1","unstructured":"Timothy P Lillicrap Jonathan J Hunt Alexander Pritzel Nicolas Heess Tom Erez Yuval Tassa David Silver and Daan Wierstra. 2015. Continuous control with deep reinforcement learning. (2015)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","first-page":"539","DOI":"10.1109\/TSMCB.2008.2007853","article-title":"Exploratory undersampling for class-imbalance learning","volume":"39","author":"Liu Xu-Ying","year":"2008","unstructured":"Xu-Ying Liu, Jianxin Wu, and Zhi-Hua Zhou. 2008. Exploratory undersampling for class-imbalance learning. IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics) 39, 2 (2008), 539--550.","journal-title":"IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics)"},{"key":"e_1_3_2_1_13_1","volume-title":"18th {USENIX} Conference on File and Storage Technologies ({FAST} 20). 151--167.","author":"Lu Sidi","unstructured":"Sidi Lu, Bing Luo, Tirthak Patel, Yongtao Yao, Devesh Tiwari, and Weisong Shi. 2020. Making disk failure predictions smarter!. In 18th {USENIX} Conference on File and Storage Technologies ({FAST} 20). 151--167."},{"key":"e_1_3_2_1_14_1","first-page":"1181","article-title":"NTAM","volume":"2021","author":"Luo Chuan","year":"2021","unstructured":"Chuan Luo, Pu Zhao, Bo Qiao, Youjiang Wu, Hongyu Zhang, Wei Wu, Weihai Lu, Yingnong Dang, Saravanakumar Rajmohan, Qingwei Lin, et al. 2021. NTAM: Neighborhood-Temporal Attention Model for Disk Failure Prediction in Cloud Platforms. In Proceedings of WWW 2021. 1181--1191.","journal-title":"In Proceedings of WWW"},{"key":"e_1_3_2_1_15_1","article-title":"Curriculum learning for reinforcement learning domains: A framework and survey","volume":"21","author":"Narvekar Sanmit","year":"2020","unstructured":"Sanmit Narvekar, Bei Peng, Matteo Leonetti, Jivko Sinapov, Matthew E Taylor, and Peter Stone. 2020. Curriculum learning for reinforcement learning domains: A framework and survey. J. Mach. Learn. Res. 21 (2020), 181:1--181:50.","journal-title":"J. Mach. Learn. Res."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSME46990.2020.00027"},{"key":"e_1_3_2_1_17_1","article-title":"Visualizing data using t-SNE","volume":"9","author":"der Maaten Laurens Van","year":"2008","unstructured":"Laurens Van der Maaten and Geoffrey Hinton. 2008. Visualizing data using t-SNE. Journal of machine learning research 9, 11 (2008).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_18_1","volume-title":"Proceedings of ICML","author":"Vezhnevets Alexander Sasha","year":"2017","unstructured":"Alexander Sasha Vezhnevets, Simon Osindero, Tom Schaul, Nicolas Heess, Max Jaderberg, David Silver, and Koray Kavukcuoglu. 2017. Feudal networks for hierarchical reinforcement learning. In Proceedings of ICML 2017. PMLR, 3540--3549."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2001.990517"},{"key":"e_1_3_2_1_20_1","volume-title":"Proceedings of USENIX ATC","author":"Xu Yong","year":"2018","unstructured":"Yong Xu, Kaixin Sui, Randolph Yao, Hongyu Zhang, Qingwei Lin, Yingnong Dang, Peng Li, Keceng Jiang, Wenchi Zhang, Jian-Guang Lou, et al. 2018. Improving service availability of cloud systems by predicting disk error. In Proceedings of USENIX ATC 2018. 481--494."}],"event":{"name":"KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining","location":"Washington DC USA","acronym":"KDD '22","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data"]},"container-title":["Proceedings of the 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539127","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3534678.3539127","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:58Z","timestamp":1750186978000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3534678.3539127"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,14]]},"references-count":19,"alternative-id":["10.1145\/3534678.3539127","10.1145\/3534678"],"URL":"https:\/\/doi.org\/10.1145\/3534678.3539127","relation":{},"subject":[],"published":{"date-parts":[[2022,8,14]]},"assertion":[{"value":"2022-08-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}