{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,6]],"date-time":"2026-05-06T23:05:37Z","timestamp":1778108737484,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":126,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,5,8]],"date-time":"2023-05-08T00:00:00Z","timestamp":1683504000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-2130560"],"award-info":[{"award-number":["CNS-2130560"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,5,8]]},"DOI":"10.1145\/3552326.3587448","type":"proceedings-article","created":{"date-parts":[[2023,5,5]],"date-time":"2023-05-05T17:33:02Z","timestamp":1683307982000},"page":"433-451","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Fail through the Cracks: Cross-System Interaction Failures in Modern Cloud Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0001-1757-7290","authenticated-orcid":false,"given":"Lilia","family":"Tang","sequence":"first","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana-Champaign, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-1683-3110","authenticated-orcid":false,"given":"Chaitanya","family":"Bhandari","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana-Champaign, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5350-5182","authenticated-orcid":false,"given":"Yongle","family":"Zhang","sequence":"additional","affiliation":[{"name":"Purdue University, West Lafayette, IN, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0362-6821","authenticated-orcid":false,"given":"Anna","family":"Karanika","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana-Champaign, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-0712-9646","authenticated-orcid":false,"given":"Shuyang","family":"Ji","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana-Champaign, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9372-5937","authenticated-orcid":false,"given":"Indranil","family":"Gupta","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana-Champaign, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4443-8170","authenticated-orcid":false,"given":"Tianyin","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Urbana-Champaign, IL, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,5,8]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"ANSI Compliance. https:\/\/spark.apache.org\/docs\/latest\/sql-ref-ansi-compliance.html.  ANSI Compliance. https:\/\/spark.apache.org\/docs\/latest\/sql-ref-ansi-compliance.html."},{"key":"e_1_3_2_1_2_1","unstructured":"Apache Avro. https:\/\/avro.apache.org\/.  Apache Avro. https:\/\/avro.apache.org\/."},{"key":"e_1_3_2_1_3_1","unstructured":"Apache Flink. https:\/\/flink.apache.org\/.  Apache Flink. https:\/\/flink.apache.org\/."},{"key":"e_1_3_2_1_4_1","unstructured":"Apache Hadoop YARN. https:\/\/hadoop.apache.org\/docs\/current\/hadoop-yarn\/hadoop-yarn-site\/YARN.html.  Apache Hadoop YARN. https:\/\/hadoop.apache.org\/docs\/current\/hadoop-yarn\/hadoop-yarn-site\/YARN.html."},{"key":"e_1_3_2_1_5_1","unstructured":"Apache HBase. https:\/\/hbase.apache.org\/book.html.  Apache HBase. https:\/\/hbase.apache.org\/book.html."},{"key":"e_1_3_2_1_6_1","unstructured":"Apache Hive. https:\/\/cwiki.apache.org\/confluence\/display\/Hive\/Tutorial.  Apache Hive. https:\/\/cwiki.apache.org\/confluence\/display\/Hive\/Tutorial."},{"key":"e_1_3_2_1_7_1","unstructured":"Apache Kafka. https:\/\/kafka.apache.org\/documentation\/.  Apache Kafka. https:\/\/kafka.apache.org\/documentation\/."},{"key":"e_1_3_2_1_8_1","unstructured":"Apache ORC: The smallest fastest columnar storage for Hadoop workloads. https:\/\/orc.apache.org\/.  Apache ORC: The smallest fastest columnar storage for Hadoop workloads. https:\/\/orc.apache.org\/."},{"key":"e_1_3_2_1_9_1","unstructured":"Apache Parquet. https:\/\/parquet.apache.org\/.  Apache Parquet. https:\/\/parquet.apache.org\/."},{"key":"e_1_3_2_1_10_1","unstructured":"Apache Spark Codebase. https:\/\/github.com\/apache\/spark.  Apache Spark Codebase. https:\/\/github.com\/apache\/spark."},{"key":"e_1_3_2_1_11_1","unstructured":"Apache Spark Website. https:\/\/spark.apache.org\/.  Apache Spark Website. https:\/\/spark.apache.org\/."},{"key":"e_1_3_2_1_12_1","unstructured":"ASF JIRA. https:\/\/issues.apache.org\/jira\/secure\/Dashboard.jspa.  ASF JIRA. https:\/\/issues.apache.org\/jira\/secure\/Dashboard.jspa."},{"key":"e_1_3_2_1_13_1","unstructured":"AWS Post-Event Summaries. https:\/\/aws.amazon.com\/premiumsupport\/technology\/pes\/.  AWS Post-Event Summaries. https:\/\/aws.amazon.com\/premiumsupport\/technology\/pes\/."},{"key":"e_1_3_2_1_14_1","unstructured":"Azure status history. https:\/\/status.azure.com\/en-us\/status\/history\/.  Azure status history. https:\/\/status.azure.com\/en-us\/status\/history\/."},{"key":"e_1_3_2_1_15_1","unstructured":"Data types (Databricks SQL). https:\/\/docs.databricks.com\/sql\/language-manual\/sql-ref-datatypes.html#data-types-databricks-sql.  Data types (Databricks SQL). https:\/\/docs.databricks.com\/sql\/language-manual\/sql-ref-datatypes.html#data-types-databricks-sql."},{"key":"e_1_3_2_1_16_1","unstructured":"Dynamic Tables. https:\/\/nightlies.apache.org\/flink\/flink-docs-release-1.14\/docs\/dev\/table\/concepts\/dynamic_tables\/.  Dynamic Tables. https:\/\/nightlies.apache.org\/flink\/flink-docs-release-1.14\/docs\/dev\/table\/concepts\/dynamic_tables\/."},{"key":"e_1_3_2_1_17_1","unstructured":"Google Cloud Service Health. https:\/\/status.cloud.google.com\/summary.  Google Cloud Service Health. https:\/\/status.cloud.google.com\/summary."},{"key":"e_1_3_2_1_18_1","unstructured":"Google was hit with massive outage including youtube gmail and google classroom | cnn business. https:\/\/www.cnn.com\/2020\/12\/14\/tech\/google-youtube-gmail-down\/index.html.  Google was hit with massive outage including youtube gmail and google classroom | cnn business. https:\/\/www.cnn.com\/2020\/12\/14\/tech\/google-youtube-gmail-down\/index.html."},{"key":"e_1_3_2_1_19_1","unstructured":"Google's apps crash in a worldwide outage. - the new york times. https:\/\/www.nytimes.com\/2020\/12\/14\/business\/google-down-worldwide.html.  Google's apps crash in a worldwide outage. - the new york times. https:\/\/www.nytimes.com\/2020\/12\/14\/business\/google-down-worldwide.html."},{"key":"e_1_3_2_1_20_1","unstructured":"Hadoop Common. https:\/\/github.com\/apache\/hadoop\/tree\/trunk\/hadoop-common-project.  Hadoop Common. https:\/\/github.com\/apache\/hadoop\/tree\/trunk\/hadoop-common-project."},{"key":"e_1_3_2_1_21_1","unstructured":"Hadoop Distributed File System. http:\/\/hadoop.apache.org\/docs\/current\/hadoop-project-dist\/hadoop-hdfs\/HdfsDesign.html.  Hadoop Distributed File System. http:\/\/hadoop.apache.org\/docs\/current\/hadoop-project-dist\/hadoop-hdfs\/HdfsDesign.html."},{"key":"e_1_3_2_1_22_1","unstructured":"Implicit assumptions of the Hadoop FileSystem APIs. https:\/\/hadoop.apache.org\/docs\/stable\/hadoop-project-dist\/hadoop-common\/filesystem\/introduction.html#Implicit_assumptions_of_the_Hadoop_FileSystem_APIs.  Implicit assumptions of the Hadoop FileSystem APIs. https:\/\/hadoop.apache.org\/docs\/stable\/hadoop-project-dist\/hadoop-common\/filesystem\/introduction.html#Implicit_assumptions_of_the_Hadoop_FileSystem_APIs."},{"key":"e_1_3_2_1_23_1","unstructured":"Incident affecting Google App Engine. https:\/\/status.cloud.google.com\/incidents\/NuaWbbv8n8V8PMHNR7kT.  Incident affecting Google App Engine. https:\/\/status.cloud.google.com\/incidents\/NuaWbbv8n8V8PMHNR7kT."},{"key":"e_1_3_2_1_24_1","unstructured":"Incident affecting Google BigQuery. https:\/\/status.cloud.google.com\/incidents\/qq7VS3aLtp6Nmgs5Nux4.  Incident affecting Google BigQuery. https:\/\/status.cloud.google.com\/incidents\/qq7VS3aLtp6Nmgs5Nux4."},{"key":"e_1_3_2_1_25_1","unstructured":"Incident affecting Google Cloud Infrastructure Components Google Cloud Support Google Cloud Console Google BigQuery Google Cloud Storage Google Cloud Networking Google Kubernetes Engine Virtual Private Cloud (VPC). https:\/\/status.cloud.google.com\/incidents\/cFXPsFUnUELR8U2bQeGz.  Incident affecting Google Cloud Infrastructure Components Google Cloud Support Google Cloud Console Google BigQuery Google Cloud Storage Google Cloud Networking Google Kubernetes Engine Virtual Private Cloud (VPC). https:\/\/status.cloud.google.com\/incidents\/cFXPsFUnUELR8U2bQeGz."},{"key":"e_1_3_2_1_26_1","unstructured":"Incident affecting Google Compute Engine Google Cloud Networking Access Approval Google App Engine. https:\/\/status.cloud.google.com\/incidents\/1tX748pbxW2JjTUuTJsx.  Incident affecting Google Compute Engine Google Cloud Networking Access Approval Google App Engine. https:\/\/status.cloud.google.com\/incidents\/1tX748pbxW2JjTUuTJsx."},{"key":"e_1_3_2_1_27_1","unstructured":"Integration with Cloud Infrastructures. https:\/\/spark.apache.org\/docs\/latest\/cloud-integration.html.  Integration with Cloud Infrastructures. https:\/\/spark.apache.org\/docs\/latest\/cloud-integration.html."},{"key":"e_1_3_2_1_28_1","unstructured":"Massive google outage takes millions offline. https:\/\/www.forbes.com\/sites\/paulmonckton\/2020\/12\/14\/massive-google-outage-takes-millions-offline\/?sh=40f33d060ad1.  Massive google outage takes millions offline. https:\/\/www.forbes.com\/sites\/paulmonckton\/2020\/12\/14\/massive-google-outage-takes-millions-offline\/?sh=40f33d060ad1."},{"key":"e_1_3_2_1_29_1","unstructured":"Protocol buffers guide. https:\/\/developers.google.com\/protocol-buffers\/docs\/proto.  Protocol buffers guide. https:\/\/developers.google.com\/protocol-buffers\/docs\/proto."},{"key":"e_1_3_2_1_30_1","unstructured":"Spark SQL Guide. https:\/\/spark.apache.org\/docs\/latest\/sql-programming-guide.html.  Spark SQL Guide. https:\/\/spark.apache.org\/docs\/latest\/sql-programming-guide.html."},{"key":"e_1_3_2_1_31_1","volume-title":"Nov.","author":"Mars Climate Orbiter Mishap I","year":"1999","unstructured":"Mars Climate Orbiter Mishap Investigation Board Phase I Report . https:\/\/llis.nasa.gov\/llis_lib\/pdf\/1009464main1_0641-mr.pdf , Nov. 1999 . Mars Climate Orbiter Mishap Investigation Board Phase I Report. https:\/\/llis.nasa.gov\/llis_lib\/pdf\/1009464main1_0641-mr.pdf, Nov. 1999."},{"key":"e_1_3_2_1_32_1","volume-title":"Cloud Bug Study Database. https:\/\/ucare.cs.uchicago.edu\/projects\/cbs\/","author":"CBS","year":"2014","unstructured":"CBS : Cloud Bug Study Database. https:\/\/ucare.cs.uchicago.edu\/projects\/cbs\/ , 2014 . CBS: Cloud Bug Study Database. https:\/\/ucare.cs.uchicago.edu\/projects\/cbs\/, 2014."},{"key":"e_1_3_2_1_33_1","volume-title":"Nov.","author":"Apache Hive SQL","year":"2018","unstructured":"Apache Hive SQL Conformance . https:\/\/cwiki.apache.org\/confluence\/display\/Hive\/Apache+Hive+SQL+Conformance , Nov. 2018 . Apache Hive SQL Conformance. https:\/\/cwiki.apache.org\/confluence\/display\/Hive\/Apache+Hive+SQL+Conformance, Nov. 2018."},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20)","author":"Alfatafta M.","year":"2020","unstructured":"Alfatafta , M. , Alkhatib , B. , Alqraan , A. , and Al-Kiswany , S . Toward a Generic Fault Tolerance Technique for Partial Network Partitioning . In Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20) ( Nov. 2020 ). Alfatafta, M., Alkhatib, B., Alqraan, A., and Al-Kiswany, S. Toward a Generic Fault Tolerance Technique for Partial Network Partitioning. In Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20) (Nov. 2020)."},{"key":"e_1_3_2_1_35_1","volume-title":"Fault injection in production: Making the case for resilience testing. Communications of the ACM (CACM) 55, 10 (Oct","author":"Allspaw J.","year":"2012","unstructured":"Allspaw , J. Fault injection in production: Making the case for resilience testing. Communications of the ACM (CACM) 55, 10 (Oct 2012 ), 48--52. Allspaw, J. Fault injection in production: Making the case for resilience testing. Communications of the ACM (CACM) 55, 10 (Oct 2012), 48--52."},{"key":"e_1_3_2_1_36_1","unstructured":"Alluxio Docs. The Need for a New Data Orchestration Platform. https:\/\/www.alluxio.io\/data-orchestration\/.  Alluxio Docs. The Need for a New Data Orchestration Platform. https:\/\/www.alluxio.io\/data-orchestration\/."},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18)","author":"Alqraan A.","year":"2018","unstructured":"Alqraan , A. , Takruri , H. , Alfatafta , M. , and Al-Kiswany , S . An Analysis of Network-Partitioning Failures in Cloud Systems . In Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18) ( Oct. 2018 ). Alqraan, A., Takruri, H., Alfatafta, M., and Al-Kiswany, S. An Analysis of Network-Partitioning Failures in Cloud Systems. In Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18) (Oct. 2018)."},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 6th Workshop on Hot Topics in System Dependability (HotDep'10)","author":"Altekar G.","year":"2010","unstructured":"Altekar , G. , and Stoica , I . Focus Replay Debugging Effort on the Control Plane . In Proceedings of the 6th Workshop on Hot Topics in System Dependability (HotDep'10) ( Oct. 2010 ). Altekar, G., and Stoica, I. Focus Replay Debugging Effort on the Control Plane. In Proceedings of the 6th Workshop on Hot Topics in System Dependability (HotDep'10) (Oct. 2010)."},{"key":"e_1_3_2_1_39_1","first-page":"12","volume":"45","author":"Amann S.","year":"2019","unstructured":"Amann , S. , Nguyen , H. A. , Nadi , S. , Nguyen , T. N. , and Mezini , M. A Systematic Evaluation of Static API-Misuse Detectors. IEEE Transactions on Software Engineering 45 , 12 ( Dec. 2019 ), 1170--1188. Amann, S., Nguyen, H. A., Nadi, S., Nguyen, T. N., and Mezini, M. A Systematic Evaluation of Static API-Misuse Detectors. IEEE Transactions on Software Engineering 45, 12 (Dec. 2019), 1170--1188.","journal-title":"A Systematic Evaluation of Static API-Misuse Detectors. IEEE Transactions on Software Engineering"},{"key":"e_1_3_2_1_40_1","volume-title":"Proceedings of the 10th USENIX Conference on Operating Systems Design and Implementation (OSDI'12)","author":"Attariyan M.","year":"2012","unstructured":"Attariyan , M. , Chow , M. , and Flinn , J . X-ray: Automating Root-Cause Diagnosis of Performance Anomalies in Production Software . In Proceedings of the 10th USENIX Conference on Operating Systems Design and Implementation (OSDI'12) ( Oct. 2012 ). Attariyan, M., Chow, M., and Flinn, J. X-ray: Automating Root-Cause Diagnosis of Performance Anomalies in Production Software. In Proceedings of the 10th USENIX Conference on Operating Systems Design and Implementation (OSDI'12) (Oct. 2012)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.5555\/1924943.1924960"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/MS.2016.60"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2786805.2786869"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/2950290.2950325"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465286"},{"key":"e_1_3_2_1_46_1","unstructured":"Brooker M. The Fundamental Mechanism of Scaling. http:\/\/brooker.co.za\/blog\/2021\/01\/22\/cloud-scale.html 2020.  Brooker M. The Fundamental Mechanism of Scaling. http:\/\/brooker.co.za\/blog\/2021\/01\/22\/cloud-scale.html 2020."},{"key":"e_1_3_2_1_47_1","first-page":"5","volume":"59","author":"Burns B.","year":"2016","unstructured":"Burns , B. , Grant , B. , Oppenheimer , D. , Brewer , E. , and Wilkes , J. Borg , Omega, and Kubernetes. Communications of the ACM 59 , 5 ( May 2016 ), 50--57. Burns, B., Grant, B., Oppenheimer, D., Brewer, E., and Wilkes, J. Borg, Omega, and Kubernetes. Communications of the ACM 59, 5 (May 2016), 50--57.","journal-title":"Kubernetes. Communications of the ACM"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2019.00040"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3368089.3409727"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/502034.502042"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3338906.3338916"},{"key":"e_1_3_2_1_52_1","unstructured":"Databricks Docs. Databricks architecture overview. https:\/\/docs.databricks.com\/getting-started\/overview.html.  Databricks Docs. Databricks architecture overview. https:\/\/docs.databricks.com\/getting-started\/overview.html."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/502034.502041"},{"key":"e_1_3_2_1_54_1","unstructured":"Flink Docs. Checkpointing. https:\/\/nightlies.apache.org\/flink\/flink-docs-release-1.14\/docs\/dev\/datastream\/fault-tolerance\/checkpointing\/.  Flink Docs. Checkpointing. https:\/\/nightlies.apache.org\/flink\/flink-docs-release-1.14\/docs\/dev\/datastream\/fault-tolerance\/checkpointing\/."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.5555\/1924943.1924948"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3236024.3236030"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2815675.2815684"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/945445.945450"},{"key":"e_1_3_2_1_59_1","unstructured":"Google Cloud. What is a hybrid cloud? https:\/\/cloud.google.com\/learn\/what-is-hybrid-cloud.  Google Cloud. What is a hybrid cloud? https:\/\/cloud.google.com\/learn\/what-is-hybrid-cloud."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934891"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3180155.3180199"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/2670979.2670986"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/2987550.2987583"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.5555\/3189759.3189761"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of the 24th ACM SIGSOFT International Symposium on Foundations of Software Engineering (FSE'16)","author":"Gyori A.","year":"2016","unstructured":"Gyori , A. , Lambeth , B. , Shi , A. , Legunsen , O. , and Marinov , D . Non-Dex: A Tool for Detecting and Debugging Wrong Assumptions on Java API Specifications . In Proceedings of the 24th ACM SIGSOFT International Symposium on Foundations of Software Engineering (FSE'16) ( Nov. 2016 ). Gyori, A., Lambeth, B., Shi, A., Legunsen, O., and Marinov, D. Non-Dex: A Tool for Detecting and Debugging Wrong Assumptions on Java API Specifications. In Proceedings of the 24th ACM SIGSOFT International Symposium on Foundations of Software Engineering (FSE'16) (Nov. 2016)."},{"key":"e_1_3_2_1_66_1","unstructured":"HBase Docs. HBase Cluster Replication. https:\/\/hbase.apache.org\/book.html#_cluster_replication.  HBase Docs. HBase Cluster Replication. https:\/\/hbase.apache.org\/book.html#_cluster_replication."},{"key":"e_1_3_2_1_67_1","unstructured":"HBase Docs. HBase Write Ahead Log. https:\/\/hbase.apache.org\/book.html#wal.  HBase Docs. HBase Write Ahead Log. https:\/\/hbase.apache.org\/book.html#wal."},{"key":"e_1_3_2_1_68_1","unstructured":"HDFS Docs. Data Replication. https:\/\/hadoop.apache.org\/docs\/stable\/hadoop-project-dist\/hadoop-hdfs\/HdfsDesign.html#Data_Replication.  HDFS Docs. Data Replication. https:\/\/hadoop.apache.org\/docs\/stable\/hadoop-project-dist\/hadoop-hdfs\/HdfsDesign.html#Data_Replication."},{"key":"e_1_3_2_1_69_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22)","author":"Huang L.","year":"2022","unstructured":"Huang , L. , Magnusson , M. , Muralikrishna , A. B. , Estyak , S. , Isaacs , R. , Aghayev , A. , Zhu , T. , and Charapko , A . Metastable Failures in the Wild . In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22) ( July 2022 ). Huang, L., Magnusson, M., Muralikrishna, A. B., Estyak, S., Isaacs, R., Aghayev, A., Zhu, T., and Charapko, A. Metastable Failures in the Wild. In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22) (July 2022)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3102980.3103005"},{"key":"e_1_3_2_1_71_1","unstructured":"Istio Docs. Architecture. https:\/\/istio.io\/latest\/docs\/ops\/deployment\/architecture\/.  Istio Docs. Architecture. https:\/\/istio.io\/latest\/docs\/ops\/deployment\/architecture\/."},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE43902.2021.00021"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/1134285.1134323"},{"key":"e_1_3_2_1_74_1","unstructured":"Kafka Docs. Auto Restart. https:\/\/kafka.apache.org\/documentation\/streams\/architecture#streams_architecture_recovery.  Kafka Docs. Auto Restart. https:\/\/kafka.apache.org\/documentation\/streams\/architecture#streams_architecture_recovery."},{"key":"e_1_3_2_1_75_1","unstructured":"Kubernetes Docs. Control Plane Components. https:\/\/kubernetes.io\/docs\/concepts\/overview\/components\/.  Kubernetes Docs. Control Plane Components. https:\/\/kubernetes.io\/docs\/concepts\/overview\/components\/."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872362.2872374"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/2970276.2970356"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359638"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3317550.3321438"},{"key":"e_1_3_2_1_80_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22)","author":"Lou C.","year":"2022","unstructured":"Lou , C. , Chen , C. , Huang , P. , Dang , Y. , Qin , S. , Yang , X. , Li , X. , Lin , Q. , and Chintalapati , M . RESIN: A Holistic Service for Dealing with Memory Leaks in Production Cloud Infrastructure . In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22) ( July 2022 ). Lou, C., Chen, C., Huang, P., Dang, Y., Qin, S., Yang, X., Li, X., Lin, Q., and Chintalapati, M. RESIN: A Holistic Service for Dealing with Memory Leaks in Production Cloud Infrastructure. In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22) (July 2022)."},{"key":"e_1_3_2_1_81_1","volume-title":"Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20)","author":"Lou C.","year":"2020","unstructured":"Lou , C. , Huang , P. , and Smith , S . Understanding, Detecting and Localizing Partial Failures in Large System Software . In Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20) ( Feb. 2020 ). Lou, C., Huang, P., and Smith, S. Understanding, Detecting and Localizing Partial Failures in Large System Software. In Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20) (Feb. 2020)."},{"key":"e_1_3_2_1_82_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22)","author":"Lou C.","year":"2022","unstructured":"Lou , C. , Jing , Y. , and Huang , P . Demystifying and Checking Silent Semantic Violations in Large Distributed Systems . In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22) ( July 2022 ). Lou, C., Jing, Y., and Huang, P. Demystifying and Checking Silent Semantic Violations in Large Distributed Systems. In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI'22) (July 2022)."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/2560012"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/1353534.1346323"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359651"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447786.3456250"},{"key":"e_1_3_2_1_87_1","doi-asserted-by":"publisher","DOI":"10.1145\/2814330"},{"key":"e_1_3_2_1_88_1","volume-title":"Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20)","author":"Mehta S.","year":"2020","unstructured":"Mehta , S. , Bhagwan , R. , Kumar , R. , Ashok , B. , Bansal , C. , Maddila , C. , Bird , C. , Asthana , S. , and Kumar , A . Rex: Preventing Bugs and Misconfiguration in Large Services using Correlated Change Analysis . In Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20) ( Feb. 2020 ). Mehta, S., Bhagwan, R., Kumar, R., Ashok, B., Bansal, C., Maddila, C., Bird, C., Asthana, S., and Kumar, A. Rex: Preventing Bugs and Misconfiguration in Large Services using Correlated Change Analysis. In Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20) (Feb. 2020)."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1145\/2815400.2815422"},{"key":"e_1_3_2_1_90_1","volume-title":"Proceedings of the 6th USENIX Conference on Operating Systems Design and Implementation (OSDI'04)","author":"Nagaraja K.","year":"2004","unstructured":"Nagaraja , K. , Oliveira , F. , Bianchini , R. , Martin , R. P. , and Nguyen , T. D . Understanding and Dealing with Operator Mistakes in Internet Services . In Proceedings of the 6th USENIX Conference on Operating Systems Design and Implementation (OSDI'04) ( Dec. 2004 ). Nagaraja, K., Oliveira, F., Bianchini, R., Martin, R. P., and Nguyen, T. D. Understanding and Dealing with Operator Mistakes in Internet Services. In Proceedings of the 6th USENIX Conference on Operating Systems Design and Implementation (OSDI'04) (Dec. 2004)."},{"key":"e_1_3_2_1_91_1","unstructured":"OpenStack Docs. Logical architecture. https:\/\/docs.openstack.org\/install-guide\/get-started-logical-architecture.html.  OpenStack Docs. Logical architecture. https:\/\/docs.openstack.org\/install-guide\/get-started-logical-architecture.html."},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.5555\/1251460.1251461"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1145\/3092703.3098219"},{"key":"e_1_3_2_1_94_1","volume-title":"Proceedings of the 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI'14)","author":"Pillai T. S.","year":"2014","unstructured":"Pillai , T. S. , Chidambaram , V. , Alagappan , R. , Al-Kiswany , S. , Arpaci-Dusseau , A. C. , and Arpaci-Dusseau , R. H . All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications . In Proceedings of the 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI'14) ( Oct. 2014 ). Pillai, T. S., Chidambaram, V., Alagappan, R., Al-Kiswany, S., Arpaci-Dusseau, A. C., and Arpaci-Dusseau, R. H. All File Systems Are Not Created Equal: On the Complexity of Crafting Crash-Consistent Applications. In Proceedings of the 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI'14) (Oct. 2014)."},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/1985793.1985812"},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.1109\/MS.2012.73"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555228.1555269"},{"key":"e_1_3_2_1_98_1","volume-title":"Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20)","author":"Rigger M.","year":"2020","unstructured":"Rigger , M. , and Su , Z . Testing Database Engines via Pivoted Query Synthesis . In Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20) ( Nov. 2020 ). Rigger, M., and Su, Z. Testing Database Engines via Pivoted Query Synthesis. In Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20) (Nov. 2020)."},{"key":"e_1_3_2_1_99_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSE.2012.63"},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2003.1240326"},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519591"},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465301"},{"key":"e_1_3_2_1_103_1","volume-title":"Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20)","author":"Sun X.","year":"2020","unstructured":"Sun , X. , Cheng , R. , Chen , J. , Ang , E. , Legunsen , O. , and Xu , T . Testing Configuration Changes in Context to Prevent Production Failures . In Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20) ( Nov. 2020 ). Sun, X., Cheng, R., Chen, J., Ang, E., Legunsen, O., and Xu, T. Testing Configuration Changes in Context to Prevent Production Failures. In Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI'20) (Nov. 2020)."},{"key":"e_1_3_2_1_104_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465276"},{"key":"e_1_3_2_1_105_1","volume-title":"The Calculus of Service Availability. Communications of the ACM (CACM) 60, 9 (Sept","author":"Treynor B.","year":"2017","unstructured":"Treynor , B. , Dahlin , M. , Rau , V. , and Beyer , B . The Calculus of Service Availability. Communications of the ACM (CACM) 60, 9 (Sept . 2017 ), 42--47. Treynor, B., Dahlin, M., Rau, V., and Beyer, B. The Calculus of Service Availability. Communications of the ACM (CACM) 60, 9 (Sept. 2017), 42--47."},{"key":"e_1_3_2_1_106_1","volume-title":"The Supercloud: Applying Internet Design Principles to Interconnecting Clouds","author":"van Renesse R.","year":"2018","unstructured":"van Renesse , R. , Weatherspoon , H. , Shen , Z. , and Song , W . The Supercloud: Applying Internet Design Principles to Interconnecting Clouds . In IEEE Internet Computing (IEEE Internet Computing '18) ( Mar. 2018 ). van Renesse, R., Weatherspoon, H., Shen, Z., and Song, W. The Supercloud: Applying Internet Design Principles to Interconnecting Clouds. In IEEE Internet Computing (IEEE Internet Computing'18) (Mar. 2018)."},{"key":"e_1_3_2_1_107_1","volume-title":"Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18)","author":"Veeraraghavan K.","year":"2018","unstructured":"Veeraraghavan , K. , Meza , J. , Michelson , S. , Panneerselvam , S. , Gyori , A. , Chou , D. , Margulis , S. , Obenshain , D. , Padmanabha , S. , Shah , A. , Song , Y. J. , and Xu , T . Maelstrom: Mitigating Datacenter-level Disasters by Draining Interdependent Traffic Safely and Efficiently . In Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18) ( Oct. 2018 ). Veeraraghavan, K., Meza, J., Michelson, S., Panneerselvam, S., Gyori, A., Chou, D., Margulis, S., Obenshain, D., Padmanabha, S., Shah, A., Song, Y. J., and Xu, T. Maelstrom: Mitigating Datacenter-level Disasters by Draining Interdependent Traffic Safely and Efficiently. In Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI'18) (Oct. 2018)."},{"key":"e_1_3_2_1_108_1","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380426"},{"key":"e_1_3_2_1_109_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE.2019.00093"},{"key":"e_1_3_2_1_110_1","doi-asserted-by":"publisher","DOI":"10.1145\/3377811.3380357"},{"key":"e_1_3_2_1_111_1","first-page":"1","volume":"17","author":"Xia W.","year":"2014","unstructured":"Xia , W. , Wen , Y. , Foh , C. H. , Niyato , D. , and Xie , H. A Survey on Software-Defined Networking. IEEE Communications Surveys & Tutorials 17 , 1 ( June 2014 ), 27--51. Xia, W., Wen, Y., Foh, C. H., Niyato, D., and Xie, H. A Survey on Software-Defined Networking. IEEE Communications Surveys & Tutorials 17, 1 (June 2014), 27--51.","journal-title":"A Survey on Software-Defined Networking. IEEE Communications Surveys & Tutorials"},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"publisher","DOI":"10.5555\/3026877.3026925"},{"key":"e_1_3_2_1_113_1","doi-asserted-by":"publisher","DOI":"10.1145\/2517349.2522727"},{"key":"e_1_3_2_1_114_1","doi-asserted-by":"publisher","DOI":"10.1145\/2791577"},{"key":"e_1_3_2_1_115_1","volume-title":"Modeling API Traffic to Catch Breaking Changes. https:\/\/www.akitasoftware.com\/blog-posts\/modeling-api-traffic-to-catch-breaking-changes","author":"Yang J.","year":"2021","unstructured":"Yang , J. Modeling API Traffic to Catch Breaking Changes. https:\/\/www.akitasoftware.com\/blog-posts\/modeling-api-traffic-to-catch-breaking-changes , 2021 . Yang, J. Modeling API Traffic to Catch Breaking Changes. https:\/\/www.akitasoftware.com\/blog-posts\/modeling-api-traffic-to-catch-breaking-changes, 2021."},{"key":"e_1_3_2_1_116_1","unstructured":"YARN Docs. YARN ResourceManager HA. https:\/\/hadoop.apache.org\/docs\/stable\/hadoop-yarn\/hadoop-yarn-site\/ResourceManagerHA.html.  YARN Docs. YARN ResourceManager HA. https:\/\/hadoop.apache.org\/docs\/stable\/hadoop-yarn\/hadoop-yarn-site\/ResourceManagerHA.html."},{"key":"e_1_3_2_1_117_1","doi-asserted-by":"publisher","DOI":"10.1145\/2043556.2043572"},{"key":"e_1_3_2_1_118_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458336.3465299"},{"key":"e_1_3_2_1_119_1","first-page":"2","volume":"22","author":"Yoo S.","year":"2012","unstructured":"Yoo , S. , and Harman , M. Regression Testing Minimisation , Selection and Prioritization: A Survey . Software Testing, Verification , and Reliability 22 , 2 ( Mar. 2012 ), 67--120. Yoo, S., and Harman, M. Regression Testing Minimisation, Selection and Prioritization: A Survey. Software Testing, Verification, and Reliability 22, 2 (Mar. 2012), 67--120.","journal-title":"Reliability"},{"key":"e_1_3_2_1_120_1","volume-title":"Proceedings of the 11th USENIX Conference on Operating Systems Design and Implementation (OSDI'14)","author":"Yuan D.","year":"2014","unstructured":"Yuan , D. , Luo , Y. , Zhuang , X. , Rodrigues , G. , Zhao , X. , Zhang , Y. , Jain , P. U. , and Stumm , M . Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-intensive Systems . In Proceedings of the 11th USENIX Conference on Operating Systems Design and Implementation (OSDI'14) ( Oct. 2014 ). Yuan, D., Luo, Y., Zhuang, X., Rodrigues, G., Zhao, X., Zhang, Y., Jain, P. U., and Stumm, M. Simple Testing Can Prevent Most Critical Failures: An Analysis of Production Failures in Distributed Data-intensive Systems. In Proceedings of the 11th USENIX Conference on Operating Systems Design and Implementation (OSDI'14) (Oct. 2014)."},{"key":"e_1_3_2_1_121_1","volume-title":"Proceedings of the 25th USENIX Security Symposium (USENIX Security '16)","author":"Yun I.","year":"2016","unstructured":"Yun , I. , Min , C. , Si , X. , Jang , Y. , Kim , T. , and Naik , M . APISan: Sanitizing API Usages through Semantic Cross-Checking . In Proceedings of the 25th USENIX Security Symposium (USENIX Security '16) ( Aug. 2016 ). Yun, I., Min, C., Si, X., Jang, Y., Kim, T., and Naik, M. APISan: Sanitizing API Usages through Semantic Cross-Checking. In Proceedings of the 25th USENIX Security Symposium (USENIX Security '16) (Aug. 2016)."},{"key":"e_1_3_2_1_122_1","volume-title":"Proceedings of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI'12)","author":"Zaharia M.","year":"2012","unstructured":"Zaharia , M. , Chowdhury , M. , Das , T. , Dave , A. , Ma , J. , McCauley , M. , Franklin , M. J. , Shenker , S. , and Stoica , I . Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing . In Proceedings of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI'12) ( Apr. 2012 ). Zaharia, M., Chowdhury, M., Das, T., Dave, A., Ma, J., McCauley, M., Franklin, M. J., Shenker, S., and Stoica, I. Resilient Distributed Datasets: A Fault-Tolerant Abstraction for In-Memory Cluster Computing. In Proceedings of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI'12) (Apr. 2012)."},{"key":"e_1_3_2_1_123_1","volume-title":"I. Automating the Debugging of Datacenter Applications with ADDA. In Proceedings of the 43rd Annual IEEE\/IFIP International Conference on Dependable Systems and Networks (DSN'13)","author":"Zamfir C.","year":"2013","unstructured":"Zamfir , C. , Altekar , G. , and Stoica :, I. Automating the Debugging of Datacenter Applications with ADDA. In Proceedings of the 43rd Annual IEEE\/IFIP International Conference on Dependable Systems and Networks (DSN'13) ( June 2013 ). Zamfir, C., Altekar, G., and Stoica:, I. Automating the Debugging of Datacenter Applications with ADDA. In Proceedings of the 43rd Annual IEEE\/IFIP International Conference on Dependable Systems and Networks (DSN'13) (June 2013)."},{"key":"e_1_3_2_1_124_1","volume-title":"Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20)","author":"Zhai E.","year":"2020","unstructured":"Zhai , E. , Chen , A. , Piskac , R. , Balakrishnan , M. , Tian , B. , Song , B. , and Zhang , H . Check before You Change: Preventing Correlated Failures in Service Updates . In Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20) ( Feb. 2020 ). Zhai, E., Chen, A., Piskac, R., Balakrishnan, M., Tian, B., Song, B., and Zhang, H. Check before You Change: Preventing Correlated Failures in Service Updates. In Proceedings of the 17th USENIX Symposium on Networked Systems Design and Implementation (NSDI'20) (Feb. 2020)."},{"key":"e_1_3_2_1_125_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541983"},{"key":"e_1_3_2_1_126_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477132.3483577"}],"event":{"name":"EuroSys '23: Eighteenth European Conference on Computer Systems","location":"Rome Italy","acronym":"EuroSys '23","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Eighteenth European Conference on Computer Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3552326.3587448","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3552326.3587448","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3552326.3587448","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:47:40Z","timestamp":1750178860000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3552326.3587448"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,8]]},"references-count":126,"alternative-id":["10.1145\/3552326.3587448","10.1145\/3552326"],"URL":"https:\/\/doi.org\/10.1145\/3552326.3587448","relation":{},"subject":[],"published":{"date-parts":[[2023,5,8]]},"assertion":[{"value":"2023-05-08","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}