{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T16:53:22Z","timestamp":1781196802167,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":14,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,15]]},"DOI":"10.1145\/3774899.3775015","type":"proceedings-article","created":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T14:35:38Z","timestamp":1765290938000},"page":"26-30","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["GPU Tail Latency Diagnosis for Serverless and HPC Workloads using eBPF"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6003-0119","authenticated-orcid":false,"given":"Erfan","family":"Darzi","sequence":"first","affiliation":[{"name":"Harvard, Boston, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-7562-611X","authenticated-orcid":false,"given":"Aldo","family":"Pareja","sequence":"additional","affiliation":[{"name":"MIT\/IBM, Boston, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-7193-8022","authenticated-orcid":false,"given":"Kaveh","family":"Jalilian","sequence":"additional","affiliation":[{"name":"Northeastern, Boston, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-5574-9045","authenticated-orcid":false,"given":"Shreeanant","family":"Bharadwaj","sequence":"additional","affiliation":[{"name":"Northeastern, Boston, MA, USA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2025,12,14]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI '24)","author":"Agrawal Amey","year":"2024","unstructured":"Amey Agrawal, Nitin Kedia, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav Gulavani, Alexey Tumanov, and Ramachandran Ramjee. 2024. Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI '24). USENIX Association, Santa Clara, CA, 41\u201361. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/agrawal"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.3390\/fi16030072"},{"key":"e_1_3_2_1_3_1","volume-title":"XPUTIMER: Anomaly Diagnostics for Divergent LLM Training in GPU Clusters of Thousand-Plus Scale. arXiv preprint arXiv:2502.05413 (February","author":"Cui Weihao","year":"2025","unstructured":"Weihao Cui, Ji Zhang, Han Zhao, Chao Liu, Wenhao Zhang, Jian Sha, Quan Chen, Bingsheng He, and Minyi Guo. 2025. XPUTIMER: Anomaly Diagnostics for Divergent LLM Training in GPU Clusters of Thousand-Plus Scale. arXiv preprint arXiv:2502.05413 (February 2025)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPCCC56509.2022.9977665"},{"key":"e_1_3_2_1_5_1","unstructured":"Zhiyi Hu Siyuan Shen Tommaso Bonato Sylvain Jeaugey Cedell Alexander Eric Spada James Dinan Jeff Hammond and Torsten Hoefler. 2025. Demystifying NCCL: An In-depth Analysis of GPU Communication Protocols and Algorithms. arXiv:2507.04786 [cs.DC]"},{"key":"e_1_3_2_1_6_1","volume-title":"Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of Large-Scale Multi-Tenant GPU Clusters for DNN Training Workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX Association, Renton, WA, 947\u2013960. https:\/\/www.usenix.org\/conference\/atc19\/presentation\/jeon"},{"key":"e_1_3_2_1_7_1","unstructured":"C.K. Luk and Lei Tian. 2022. Performance Debugging of Production PyTorch Models at Meta. PyTorch Blog. https:\/\/pytorch.org\/blog\/performance-debugging-of-production-pytorch-models-at-meta\/"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3627703.3629578"},{"key":"e_1_3_2_1_9_1","volume-title":"Llumnix: Dynamic Scheduling for Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI '24)","author":"Sun Biao","year":"2024","unstructured":"Biao Sun, Ziming Huang, Hanyu Zhao, Wencong Xiao, Xinyi Zhang, Yong Li, and Wei Lin. 2024. Llumnix: Dynamic Scheduling for Large Language Model Serving. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI '24). USENIX Association, Santa Clara, CA, 63\u201381. https:\/\/www.usenix.org\/conference\/osdi24\/presentation\/sun-biao"},{"key":"e_1_3_2_1_10_1","unstructured":"Tingting Wang and Guilin Qi. 2024. A Comprehensive Survey on Root Cause Analysis in (Micro) Services: Methodologies Challenges and Trends. arXiv:2408.00803 [cs.SE]"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 4th Workshop on Heterogeneous Composable and Disaggregated Systems. 73\u201379","author":"Yang Yiwei","year":"2025","unstructured":"Yiwei Yang, Tong Yu, Yusheng Zheng, and Andrew Quinn. 2025. eGPU: Extending eBPF Programmability and Observability to GPUs. In Proceedings of the 4th Workshop on Heterogeneous Composable and Disaggregated Systems. 73\u201379."},{"key":"e_1_3_2_1_12_1","volume-title":"Tally: Non-Intrusive Performance Isolation for Concurrent Deep Learning Workloads. arXiv preprint arXiv:2410.07381","author":"Zhao Wei","year":"2024","unstructured":"Wei Zhao, Anand Jayarajan, and Gennady Pekhimenko. 2024. Tally: Non-Intrusive Performance Isolation for Concurrent Deep Learning Workloads. arXiv preprint arXiv:2410.07381 (2024). arXiv:2410.07381 [cs.DC]"},{"key":"e_1_3_2_1_13_1","volume-title":"20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23)","author":"Zhou Yang","year":"2023","unstructured":"Yang Zhou, Zezhou Wang, Sowmya Dharanipragada, and Minlan Yu. 2023. Electrode: Accelerating Distributed Protocols with eBPF. In 20th USENIX Symposium on Networked Systems Design and Implementation (NSDI 23). USENIX Association, Boston, MA, 1391\u20131407. https:\/\/www.usenix.org\/conference\/nsdi23\/presentation\/zhou"},{"key":"e_1_3_2_1_14_1","volume-title":"21st USENIX Symposium on Networked Systems Design and Implementation (NSDI '24)","author":"Zhou Yang","year":"2024","unstructured":"Yang Zhou, Xingyu Xiang, Matthew Kiley, Sowmya Dharanipragada, and Minlan Yu. 2024. Dint: Fast In-Kernel Distributed Transactions with eBPF. In 21st USENIX Symposium on Networked Systems Design and Implementation (NSDI '24). USENIX Association, Santa Clara, CA. https:\/\/www.usenix.org\/conference\/nsdi24\/presentation\/zhou-yang"}],"event":{"name":"WoSC11 '25: 11th International Workshop on Serverless Computing","location":"Vanderbilt University Nashville TN USA","acronym":"WoSC11 '25","sponsor":["IFIP","Usenix"]},"container-title":["Proceedings of the 11th International Workshop on Serverless Computing"],"original-title":[],"deposited":{"date-parts":[[2025,12,9]],"date-time":"2025-12-09T14:35:52Z","timestamp":1765290952000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3774899.3775015"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,14]]},"references-count":14,"alternative-id":["10.1145\/3774899.3775015","10.1145\/3774899"],"URL":"https:\/\/doi.org\/10.1145\/3774899.3775015","relation":{},"subject":[],"published":{"date-parts":[[2025,12,14]]},"assertion":[{"value":"2025-12-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}