{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T10:20:58Z","timestamp":1777458058390,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2026,5,4]]},"DOI":"10.1145\/3777911.3800637","type":"proceedings-article","created":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T13:00:11Z","timestamp":1776949211000},"page":"163-172","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Beyond Microservices: Testing Web-Scale RCA Methods on GPU-Driven LLM Workloads"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0763-3233","authenticated-orcid":false,"given":"Dominik","family":"Scheinert","sequence":"first","affiliation":[{"name":"logsight.ai GmbH, Berlin, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0108-3034","authenticated-orcid":false,"given":"Alexander","family":"Acker","sequence":"additional","affiliation":[{"name":"logsight.ai GmbH, Berlin, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5154-7813","authenticated-orcid":false,"given":"Thorsten","family":"Wittkopp","sequence":"additional","affiliation":[{"name":"Technische Universit\u00e4t Berlin, Berlin, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6487-1268","authenticated-orcid":false,"given":"Soeren","family":"Becker","sequence":"additional","affiliation":[{"name":"logsight.ai GmbH, Berlin, Germany"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2078-0668","authenticated-orcid":false,"given":"Hamza","family":"Yous","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute, Abu Dhabi, United Arab Emirates"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5699-4089","authenticated-orcid":false,"given":"Karnakar","family":"Reddy","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute, Abu Dhabi, United Arab Emirates"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5478-7799","authenticated-orcid":false,"given":"Ibrahim","family":"Farhat","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute, Abu Dhabi, United Arab Emirates"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2265-9343","authenticated-orcid":false,"given":"Hakim","family":"Hacid","sequence":"additional","affiliation":[{"name":"Technology Innovation Institute, Abu Dhabi, United Arab Emirates"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6454-6799","authenticated-orcid":false,"given":"Odej","family":"Kao","sequence":"additional","affiliation":[{"name":"Technische Universit\u00e4t Berlin, Berlin, Germany"}]}],"member":"320","published-online":{"date-parts":[[2026,5,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE48619.2023.00149"},{"key":"e_1_3_2_1_2_1","volume-title":"Salesforce CausalAI Library: A Fast and Scalable Framework for Causal Analysis of Time Series and Tabular Data. CoRR","author":"Arpit Devansh","year":"2023","unstructured":"Devansh Arpit, Matthew Fernandez, Chenghao Liu, Weiran Yao, Wenzhuo Yang, Paul Josel, Shelby Heinecke, Eric Hu, Huan Wang, Stephen C. H. Hoi, Caiming Xiong, Kun Zhang, and Juan Carlos Niebles. 2023. Salesforce CausalAI Library: A Fast and Scalable Framework for Causal Analysis of Time Series and Tabular Data. CoRR, Vol. abs\/2301.10859 (2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Dependable computing: From concepts to design diversity. Proc","author":"Avizienis Algirdas","year":"1986","unstructured":"Algirdas Avizienis and Jean-Claude Laprie. 1986. Dependable computing: From concepts to design diversity. Proc. IEEE, Vol. 74, 5 (1986)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/MS.2016.60"},{"key":"e_1_3_2_1_5_1","volume-title":"Tianle Li, Dacheng Li, Banghua Zhu, Hao Zhang, Michael I. Jordan, Joseph E. Gonzalez, and Ion Stoica.","author":"Chiang Wei-Lin","year":"2024","unstructured":"Wei-Lin Chiang, Lianmin Zheng, Ying Sheng, Anastasios Nikolas Angelopoulos, Tianle Li, Dacheng Li, Banghua Zhu, Hao Zhang, Michael I. Jordan, Joseph E. Gonzalez, and Ion Stoica. 2024. Chatbot Arena: An Open Platform for Evaluating LLMs by Human Preference. In ICML. OpenReview.net."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2015.11.021"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304004"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1016\/0165-1889(80)90069-X"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPA-BDCloud-SocialCom-SustainCom52081.2021.00074"},{"key":"e_1_3_2_1_10_1","volume-title":"Saurabh Bagchi, and Murat Kocaoglu.","author":"Ikram Azam","year":"2022","unstructured":"Azam Ikram, Sarthak Chakraborty, Subrata Mitra, Shiv Kumar Saini, Saurabh Bagchi, and Murat Kocaoglu. 2022. Root Cause Analysis of Failures in Microservices through Causal Discovery. In NeurIPS."},{"key":"e_1_3_2_1_11_1","volume-title":"The Foundation Cracks: A Comprehensive Study on Bugs and Testing Practices in LLM Libraries. CoRR","author":"Jiang Weipeng","year":"2025","unstructured":"Weipeng Jiang, Xiaoyu Zhang, Xiaofei Xie, Jiongchi Yu, Yuhan Zhi, Shiqing Ma, and Chao Shen. 2025. The Foundation Cracks: A Comprehensive Study on Bugs and Testing Practices in LLM Libraries. CoRR, Vol. abs\/2506.12320 (2025)."},{"key":"e_1_3_2_1_12_1","unstructured":"Myunghwan Kim Roshan Sumbaly and Sam Shah. 2013. Root cause detection in a service-oriented architecture. In SIGMETRICS. ACM."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA61900.2025.00096"},{"key":"e_1_3_2_1_14_1","volume-title":"Joseph Gonzalez, Hao Zhang, and Ion Stoica.","author":"Kwon Woosuk","year":"2023","unstructured":"Woosuk Kwon, Zhuohan Li, Siyuan Zhuang, Ying Sheng, Lianmin Zheng, Cody Hao Yu, Joseph Gonzalez, Hao Zhang, and Ion Stoica. 2023. Efficient Memory Management for Large Language Model Serving with PagedAttention. In SOSP. ACM."},{"key":"e_1_3_2_1_15_1","unstructured":"Mingjie Li Zeyan Li Kanglin Yin Xiaohui Nie Wenchi Zhang Kaixin Sui and Dan Pei. 2022. Causal Inference-Based Root Cause Analysis for Online Service Systems with Intervention Recognition. In SIGKDD. ACM."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/IWQOS52092.2021.9521340"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/SSE62657.2024.00030"},{"key":"e_1_3_2_1_18_1","volume-title":"Microscope: Pinpoint Performance Issues with Causal Graphs in Micro-service Environments. In ICSOC","author":"Lin JinJin","year":"2018","unstructured":"JinJin Lin, Pengfei Chen, and Zibin Zheng. 2018. Microscope: Pinpoint Performance Issues with Causal Graphs in Micro-service Environments. In ICSOC, Vol. 11236. Springer."},{"key":"e_1_3_2_1_19_1","volume-title":"A First Look at Bugs in LLM Inference Engines. CoRR","author":"Liu Mugeng","year":"2025","unstructured":"Mugeng Liu, Siqi Zhong, Weichen Bi, Yixuan Zhang, Zhiyang Chen, Zhenpeng Chen, Xuanzhe Liu, and Yun Ma. 2025. A First Look at Bugs in LLM Inference Engines. CoRR, Vol. abs\/2506.09713 (2025)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/IWQoS49365.2020.9213058"},{"key":"e_1_3_2_1_21_1","volume-title":"Computing","volume":"107","author":"Moreschini Sergio","year":"2025","unstructured":"Sergio Moreschini, Shahrzad Pour, Ivan Lanese, Daniel Balouek, Justus Bogner, Xiaozhou Li, Fabiano Pecorelli, Jacopo Soldani, Eddy Truyen, and Davide Taibi. 2025. AI Techniques in the Microservices Life-Cycle: a Systematic Mapping Study. Computing, Vol. 107, 4 (2025)."},{"key":"e_1_3_2_1_22_1","volume-title":"Ray: A Distributed Framework for Emerging AI Applications","author":"Moritz Philipp","year":"2018","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael I. Jordan, and Ion Stoica. 2018. Ray: A Distributed Framework for Emerging AI Applications. In OSDI. USENIX Association."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.artmed.2024.102900"},{"key":"e_1_3_2_1_24_1","unstructured":"Changhua Pei Zexin Wang Fengrui Liu Zeyan Li Yang Liu Xiao He Rong Kang Tieying Zhang Jianjun Chen Jianhui Li Gaogang Xie and Dan Pei. 2025. Flow-of-Action: SOP Enhanced LLM-Based Multi-Agent System for Root Cause Analysis. In WWW. ACM."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3660805"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"crossref","unstructured":"Luan Pham Hongyu Zhang Huong Ha Flora Salim and Xiuzhen Zhang. 2025. RCAEval: A Benchmark for Root Cause Analysis of Microservice Systems with Telemetry Data. In WWW. ACM.","DOI":"10.1145\/3701716.3715290"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"crossref","unstructured":"Huasong Shan Yuan Chen Haifeng Liu Yunpeng Zhang Xiao Xiao Xiaofeng He Min Li and Wei Ding. 2019. \u03b5-Diagnosis: Unsupervised and Real-time Diagnosis of Small- window Long-tail Latency in Large-scale Microservice Platforms. In WWW. ACM.","DOI":"10.1145\/3308558.3313653"},{"key":"e_1_3_2_1_28_1","volume-title":"Cost-Effective Large Language Model Inference Infrastructure. CoRR","author":"Shan Jiaxin","year":"2025","unstructured":"Jiaxin Shan, Varun Gupta, Le Xu, Haiyang Shi, Jingyuan Zhang, Ning Wang, Linhui Xu, Rong Kang, Tongping Liu, Yifei Zhang, Yiqing Zhu, Shuowei Jin, Gangmuk Lim, Binbin Chen, Zuzhi Chen, Xiao Liu, Xin Chen, Kante Yin, Chak-Pong Chung, Chenyu Jiang, Yicheng Lu, Jianjun Chen, Caixue Lin, Wu Xiang, Rui Shi, and Liguang Xie. 2025. AIBrix: Towards Scalable, Cost-Effective Large Language Model Inference Infrastructure. CoRR, Vol. abs\/2504.03648 (2025)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"crossref","unstructured":"Junxian Shen Han Zhang Yang Xiang Xingang Shi Xinrui Li Yunxi Shen Zijian Zhang Yongxiang Wu Xia Yin Jilong Wang Mingwei Xu Yahui Li Jiping Yin Jianchang Song Zhuofeng Li and Runjie Nie. 2023. Network-Centric Distributed Tracing with DeepFlow: Troubleshooting Your Microservices in Zero Code. In SIGCOMM. ACM.","DOI":"10.1145\/3603269.3604823"},{"key":"e_1_3_2_1_30_1","unstructured":"Ying Sheng Shiyi Cao Dacheng Li Coleman Hooper Nicholas Lee Shuo Yang Christopher Chou Banghua Zhu Lianmin Zheng Kurt Keutzer Joseph Gonzalez and Ion Stoica. 2024. SLoRA: Scalable Serving of Thousands of LoRA Adapters. In MLSys. mlsys.org."},{"key":"e_1_3_2_1_31_1","volume-title":"Kerminen","author":"Shimizu Shohei","year":"2006","unstructured":"Shohei Shimizu, Patrik O. Hoyer, Aapo Hyv\u00e4rinen, and Antti J. Kerminen. 2006. A Linear Non-Gaussian Acyclic Model for Causal Discovery. J. Mach. Learn. Res., Vol. 7 (2006)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCSS.2024.3404039"},{"key":"e_1_3_2_1_33_1","volume-title":"Prediction, and Search","author":"Spirtes Peter","unstructured":"Peter Spirtes, Clark Glymour, and Richard Scheines. 2000. Causation, Prediction, and Search, Second Edition. MIT Press."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Abhishek Verma Luis Pedrosa Madhukar Korupolu David Oppenheimer Eric Tune and John Wilkes. 2015. Large-scale cluster management at Google with Borg. In EuroSys. ACM.","DOI":"10.1145\/2741948.2741964"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"crossref","unstructured":"Zefan Wang Zichuan Liu Yingying Zhang Aoxiao Zhong Jihong Wang Fengbin Yin Lunting Fan Lingfei Wu and Qingsong Wen. 2024. RCAgent: Cloud Root Cause Analysis by Autonomous Agents with Tool-Augmented Large Language Models. In CIKM. ACM.","DOI":"10.1145\/3627673.3680016"},{"key":"e_1_3_2_1_36_1","volume-title":"Quoc V. Le, and Denny Zhou.","author":"Wei Jason","year":"2022","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed H. Chi, Quoc V. Le, and Denny Zhou. 2022. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. In NeurIPS."},{"key":"e_1_3_2_1_37_1","volume-title":"Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush.","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. 2020. Transformers: State-of-the-Art Natural Language Processing. In EMNLP. Association for Computational Linguistics."},{"key":"e_1_3_2_1_38_1","volume-title":"MicroRCA: Root Cause Localization of Performance Issues in Microservices","author":"Wu Li","unstructured":"Li Wu, Johan Tordsson, Erik Elmroth, and Odej Kao. 2020. MicroRCA: Root Cause Localization of Performance Issues in Microservices. In NOMS. IEEE."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jss.2023.111724"},{"key":"e_1_3_2_1_40_1","unstructured":"Junjielong Xu Qinan Zhang Zhiqing Zhong Shilin He Chaoyun Zhang Qingwei Lin Dan Pei Pinjia He Dongmei Zhang and Qi Zhang. 2025. OpenRCA: Can Large Language Models Locate the Root Cause of Software Failures?. In ICLR. OpenReview.net."},{"key":"e_1_3_2_1_41_1","unstructured":"Guangba Yu Pengfei Chen Hongyang Chen Zijie Guan Zicheng Huang Linxiao Jing Tianjun Weng Xinmeng Sun and Xiaoyun Li. 2021. MicroRank: End-to-End Latency Issue Localization with Extended Spectrum Analysis in Microservice Environments. In WWW. ACM \/ IW3C2."},{"key":"e_1_3_2_1_42_1","volume-title":"Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun.","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In OSDI. USENIX Association."},{"key":"e_1_3_2_1_43_1","volume-title":"A Survey on Failure Analysis and Fault Injection in AI Systems. CoRR","author":"Yu Guangba","year":"2024","unstructured":"Guangba Yu, Gou Tan, Haojia Huang, Zhenyu Zhang, Pengfei Chen, Roberto Natella, and Zibin Zheng. 2024. A Survey on Failure Analysis and Fault Injection in AI Systems. CoRR, Vol. abs\/2407.00125 (2024)."}],"event":{"name":"ICPE '26: 17th ACM\/SPEC International Conference on Performance Engineering","location":"Florence Italy","sponsor":["SIGSOFT ACM Special Interest Group on Software Engineering","SIGMETRICS ACM Special Interest Group on Measurement and Evaluation","SPEC"]},"container-title":["Companion of the 17th ACM\/SPEC International Conference on Performance Engineering"],"original-title":[],"deposited":{"date-parts":[[2026,4,28]],"date-time":"2026-04-28T13:06:12Z","timestamp":1777381572000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3777911.3800637"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5,3]]},"references-count":43,"alternative-id":["10.1145\/3777911.3800637","10.1145\/3777911"],"URL":"https:\/\/doi.org\/10.1145\/3777911.3800637","relation":{},"subject":[],"published":{"date-parts":[[2026,5,3]]},"assertion":[{"value":"2026-05-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}