{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,3]],"date-time":"2026-06-03T08:00:11Z","timestamp":1780473611386,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":31,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12]]},"DOI":"10.1145\/3773274.3774280","type":"proceedings-article","created":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T11:40:28Z","timestamp":1767181228000},"page":"1-10","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["When Timeouts Fail: Revisiting Fault Detection under Resource Stress in Edge Computing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0009-6639-6339","authenticated-orcid":false,"given":"Maryam","family":"Pourreza","sequence":"first","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9100-0943","authenticated-orcid":false,"given":"Priya","family":"Narasimhan","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,12,31]]},"reference":[{"key":"e_1_3_3_2_2_2","doi-asserted-by":"crossref","unstructured":"Saurabh Bagchi Muhammad-Bilal Siddiqui Paul Wood and Heng Zhang. 2020. Dependability in Edge Computing. Commun. ACM 63 1 (2020) 58\u201366.","DOI":"10.1145\/3362068"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"publisher","DOI":"10.1145\/3517206.3526269"},{"key":"e_1_3_3_2_4_2","unstructured":"Armon Dadgar James Phillips and Jon Currey. 2018. Lifeguard: SWIM-ing with Situational Awareness. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1707.00788 (2018)."},{"key":"e_1_3_3_2_5_2","doi-asserted-by":"publisher","DOI":"10.1109\/IC2E.2018.00022"},{"key":"e_1_3_3_2_6_2","unstructured":"Patrick Dendorfer Hamid Rezatofighi Anton Milan Javen Shi Daniel Cremers Ian Reid Stefan Roth Konrad Schindler and Laura Leal-Taix\u00e9. 2020. Mot20: A benchmark for multi object tracking in crowded scenes. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2003.09003 (2020)."},{"key":"e_1_3_3_2_7_2","first-page":"1127","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Dong Gen","year":"2025","unstructured":"Gen Dong, Yu Hua, Yongle Zhang, Zhangyu Chen, and Menglei Chen. 2025. Understanding and Detecting { Fail-Slow} Hardware Failure Bugs in Cloud Systems. In 2025 USENIX Annual Technical Conference (USENIX ATC 25). 1127\u20131142."},{"key":"e_1_3_3_2_8_2","doi-asserted-by":"crossref","unstructured":"Piotr Grzesik and Dariusz Mrozek. 2024. Combining machine learning and edge computing: Opportunities challenges platforms frameworks and use cases. Electronics 13 3 (2024) 640.","DOI":"10.3390\/electronics13030640"},{"key":"e_1_3_3_2_9_2","doi-asserted-by":"publisher","DOI":"10.1109\/MILCOM58377.2023.10356302"},{"key":"e_1_3_3_2_10_2","doi-asserted-by":"publisher","DOI":"10.5555\/1032662.1034350"},{"key":"e_1_3_3_2_11_2","first-page":"359","volume-title":"22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25)","author":"Lu Ruiming","year":"2025","unstructured":"Ruiming Lu, Yunchi Lu, Yuxuan Jiang, Guangtao Xue, and Peng Huang. 2025. { One-Size-Fits-None} : Understanding and Enhancing { Slow-Fault} Tolerance in Modern Distributed Systems. In 22nd USENIX Symposium on Networked Systems Design and Implementation (NSDI 25). 359\u2013378."},{"key":"e_1_3_3_2_12_2","first-page":"1005","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Lu Ruiming","year":"2022","unstructured":"Ruiming Lu, Erci Xu, Yiming Zhang, Zhaosheng Zhu, Mengtian Wang, Zongpeng Zhu, Guangtao Xue, Minglu Li, and Jiesheng Wu. 2022. { NVMe}{ SSD} failures in the field: the { Fail-Stop} and the { Fail-Slow}. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 1005\u20131020."},{"key":"e_1_3_3_2_13_2","first-page":"467","volume-title":"USENIX Annual Technical Conference (ATC)","author":"Ma Sixiang","year":"2018","unstructured":"Sixiang Ma and Yang Wang. 2018. Accurate Timeout Detection Despite Arbitrary Processing Delays. In USENIX Annual Technical Conference (ATC). 467\u2013480."},{"key":"e_1_3_3_2_14_2","unstructured":"Redowan Mahmud Ramamohanarao Kotagiri and Rajkumar Buyya. 2018. Fog Computing: A Taxonomy Survey and Future Directions. Internet of Things 6 (2018) 100\u2013128."},{"key":"e_1_3_3_2_15_2","doi-asserted-by":"publisher","DOI":"10.1145\/3318216.3363299"},{"key":"e_1_3_3_2_16_2","unstructured":"Ardit Memishi Besmir Bregasi and Laura Ricci. 2019. Failure Detection in MapReduce and Cloud Platforms: Algorithms and Case Study. J. Parallel and Distrib. Comput. 128 (2019) 103\u2013114."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3578354.3592873"},{"key":"e_1_3_3_2_18_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICFEC57925.2023.00018"},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/EDGE60047.2023.00021"},{"key":"e_1_3_3_2_20_2","unstructured":"The\u00a0Kubernetes Project. 2024. Kubernetes Node Heartbeats and Status (NodeMonitorGracePeriod). https:\/\/kubernetes.io\/docs\/reference\/node\/node-status\/."},{"key":"e_1_3_3_2_21_2","unstructured":"M.\u00a0A. et\u00a0al. Rahman. 2022. Privacy-Preserving Framework for Edge Computing: A Comprehensive Review. IEEE Communications Surveys & Tutorials 24 3 (2022) 1518\u20131545."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"R.\u00a0A. Rajagede M. Santriaji M.\u00a0A. Fikriansyah H.\u00a0H. Nuha Y. Fu and Y. Solihin. 2025. NAPER: Fault Protection for Real-Time Resource-Constrained Deep Neural Networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.06591 (2025).","DOI":"10.1109\/IOLTS65288.2025.11116827"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/EDGE55608.2022.00024"},{"key":"e_1_3_3_2_24_2","unstructured":"Ankur Rawat Rajiv Sushil and Sachin\u00a0K. Sahu. 2021. A new adaptive fault tolerant framework in the cloud. IETE Journal of Research 67 6 (2021) 796\u2013808."},{"key":"e_1_3_3_2_25_2","unstructured":"Banhisikha Ray Anindita Saha Prasenjit Chatterjee Bibhudatta Sahoo and Siddhartha Bhattacharyya. 2020. Proactive fault-tolerance technique to enhance reliability of cloud service in cloud federation environment. IEEE Transactions on Cloud Computing 8 2 (2020) 521\u2013534."},{"key":"e_1_3_3_2_26_2","first-page":"228","volume-title":"IEEE International Conference on Cloud Computing (CLOUD)","author":"Saadoon A.","year":"2021","unstructured":"A. Saadoon, N. Samaan, S. Baroud, and A. Bedeir. 2021. Experimental Evaluation of Heartbeat Timeout Impact on Hadoop Cluster Performance. In IEEE International Conference on Cloud Computing (CLOUD). 228\u2013237."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"crossref","unstructured":"Muntadher Saadoon Siti Hafizah\u00a0Ab Hamid Hazrina Sofian Hamza Altarturi Nur Nasuha Zati\u00a0Hakim Azizul Asmiza\u00a0Abdul Sani and Adeleh Asemi. 2021. Experimental analysis in Hadoop MapReduce: a closer look at fault detection and recovery techniques. Sensors 21 11 (2021) 3799.","DOI":"10.3390\/s21113799"},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Mahadev Satyanarayanan. 2017. The Emergence of Edge Computing. IEEE Computer 50 1 (2017) 30\u201339.","DOI":"10.1109\/MC.2017.9"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"crossref","unstructured":"W. Shi J. Cao Q. Zhang Y. Li and L. Xu. 2016. Edge Computing: Vision and Challenges. IEEE Internet of Things Journal 3 5 (2016) 637\u2013646.","DOI":"10.1109\/JIOT.2016.2579198"},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"publisher","DOI":"10.1145\/3318216.3363305"},{"key":"e_1_3_3_2_31_2","unstructured":"Li Wu Walid\u00a0A. Hanafy Tarek Abdelzaher David Irwin Jesse Milzman and Prashant Shenoy. 2025. FailLite: Failure-Resilient Model Serving for Resource-Constrained Edge Environments. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2504.15856 (2025)."},{"key":"e_1_3_3_2_32_2","first-page":"731","volume-title":"2025 USENIX Annual Technical Conference (USENIX ATC 25)","author":"Wu Tianyuan","year":"2025","unstructured":"Tianyuan Wu, Wei Wang, Yinghao Yu, Siran Yang, Wenchao Wu, Qinkai Duan, Guodong Yang, Jiamang Wang, Lin Qu, and Liping Zhang. 2025. { GREYHOUND} : Hunting { Fail-Slows} in { Hybrid-Parallel} Training at Scale. In 2025 USENIX Annual Technical Conference (USENIX ATC 25). 731\u2013747."}],"event":{"name":"UCC '25: 2025 IEEE\/ACM 18th International Conference on Utility and Cloud Computing","location":"France France","acronym":"UCC '25","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 18th IEEE\/ACM International Conference on Utility and Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3773274.3774280","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T11:42:14Z","timestamp":1767181334000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3773274.3774280"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12]]},"references-count":31,"alternative-id":["10.1145\/3773274.3774280","10.1145\/3773274"],"URL":"https:\/\/doi.org\/10.1145\/3773274.3774280","relation":{},"subject":[],"published":{"date-parts":[[2025,12]]},"assertion":[{"value":"2025-12-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}