{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,23]],"date-time":"2026-04-23T07:54:20Z","timestamp":1776930860585,"version":"3.51.2"},"publisher-location":"New York, NY, USA","reference-count":37,"publisher":"ACM","license":[{"start":{"date-parts":[[2026,11,15]],"date-time":"2026-11-15T00:00:00Z","timestamp":1794700800000},"content-version":"vor","delay-in-days":365,"URL":"http:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["2230079, 2230077, 2431425"],"award-info":[{"award-number":["2230079, 2230077, 2431425"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3731599.3767385","type":"proceedings-article","created":{"date-parts":[[2025,11,7]],"date-time":"2025-11-07T16:18:44Z","timestamp":1762532324000},"page":"393-400","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["The Cost of Teaching Operational ML"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9897-9282","authenticated-orcid":false,"given":"Fraida","family":"Fund","sequence":"first","affiliation":[{"name":"Tandon School of Engineering, New York University, Brooklyn, New York, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5251-5466","authenticated-orcid":false,"given":"Kate","family":"Keahey","sequence":"additional","affiliation":[{"name":"Argonne National Laboratory (ANL), Lemont, Illinois, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1516-8562","authenticated-orcid":false,"given":"Cody","family":"Hammock","sequence":"additional","affiliation":[{"name":"Texas Advanced Computing Center (TACC), Austin, Texas, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3532-4281","authenticated-orcid":false,"given":"Marc","family":"Richardson","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, Illinois, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9002-6586","authenticated-orcid":false,"given":"Mark","family":"Powers","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, Illinois, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0352-8842","authenticated-orcid":false,"given":"Michael","family":"Sherman","sequence":"additional","affiliation":[{"name":"University of Chicago, Chicago, Illinois, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","volume-title":"Driving Autonomous Cars From Edge to Cloud with CHI@Edge","author":"Anderson Richard","year":"2022","unstructured":"Richard Anderson. 2022. Driving Autonomous Cars From Edge to Cloud with CHI@Edge. https:\/\/chameleoncloud.org\/blog\/2022\/12\/19\/driving-autonomous-cars-from-edge-to-cloud-with-chiedge\/"},{"key":"e_1_3_3_2_3_2","doi-asserted-by":"crossref","unstructured":"Ilya Baldin Anita Nikolich James Griffioen Indermohan Inder\u00a0S Monga Kuang-Ching Wang Tom Lehman and Paul Ruth. 2020. Fabric: A national-scale programmable experimental network infrastructure. IEEE Internet Computing 23 6 (2020) 38\u201347.","DOI":"10.1109\/MIC.2019.2958545"},{"key":"e_1_3_3_2_4_2","doi-asserted-by":"publisher","DOI":"10.1109\/BigData.2017.8258038"},{"key":"e_1_3_3_2_5_2","volume-title":"Presentation for mini-symposium on education using Chameleon","author":"Canonico Massimo","year":"2023","unstructured":"Massimo Canonico. 2023. Presentation for mini-symposium on education using Chameleon. Video available in program listing at: https:\/\/chameleoncloud.org\/chameleon-cloud-users-meeting\/user-meeting-2023\/."},{"key":"e_1_3_3_2_6_2","volume-title":"Reproduce, Rerun, Repeat: The Fun Way to Learn Machine Learning!","author":"Shekhar\u00a0Pandey Priyanka\u00a0Bose Chandra","year":"2023","unstructured":"Priyanka\u00a0Bose Chandra Shekhar\u00a0Pandey. 2023. Reproduce, Rerun, Repeat: The Fun Way to Learn Machine Learning!https:\/\/www.chameleoncloud.org\/blog\/2023\/03\/28\/reproduce-rerun-repeat-the-fun-way-to-learn-machine-learning\/"},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3399579.3399867"},{"key":"e_1_3_3_2_8_2","volume-title":"Proceedings of the 37th International Conference on Neural Information Processing Systems","author":"Dettmers Tim","year":"2023","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. 2023. QLORA: efficient finetuning of quantized LLMs. In Proceedings of the 37th International Conference on Neural Information Processing Systems."},{"key":"e_1_3_3_2_9_2","volume-title":"Using Chameleon for HPC Education: an OpenMP Tutorial","author":"Diaz Jose Manuel\u00a0Monsalve","year":"2023","unstructured":"Jose Manuel\u00a0Monsalve Diaz. 2023. Using Chameleon for HPC Education: an OpenMP Tutorial. https:\/\/www.chameleoncloud.org\/blog\/2023\/05\/30\/jose-monsalve-education-user-blog-post-interview\/"},{"key":"e_1_3_3_2_10_2","volume-title":"Proceedings of the USENIX Annual Technical Conference (Usenix ATC \u201919\u2019)","author":"Duplyakin Dmitry","year":"2019","unstructured":"Dmitry Duplyakin, Robert Ricci, Aleksander Maricq, Gary Wong, Jonathon Duerig, Eric Eide, Leigh Stoller, Mike Hibler, David Johnson, Kirk Webb, Aditya Akella, Kuangching Wang, Glenn Ricart, Larry Landweber, Chip Elliott, Michael Zink, Emmanuel Cecchet, Snigdhaswin Kar, and Prabodh Mishra. 2019. The Design and Operation of CloudLab. In Proceedings of the USENIX Annual Technical Conference (Usenix ATC \u201919\u2019)."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1145\/3624062.3624101"},{"key":"e_1_3_3_2_12_2","unstructured":"Fraida Fund. 2025. Machine Learning Systems Engineering and Operations. https:\/\/ffund.github.io\/ml-sys-ops\/. Accessed: 2025-08-01."},{"key":"e_1_3_3_2_13_2","unstructured":"Andrew Gibiansky. 2017. Bringing HPC Techniques to Deep Learning. https:\/\/andrew.gibiansky.com\/blog\/machine-learning\/baidu-allreduce\/ Originally published on Baidu Research technical blog."},{"key":"e_1_3_3_2_14_2","volume-title":"Educating with Chameleon at Vanderbilt","author":"Gokhale Aniruddha","year":"2023","unstructured":"Aniruddha Gokhale. 2023. Educating with Chameleon at Vanderbilt. https:\/\/www.chameleoncloud.org\/blog\/2023\/07\/17\/educating-with-chameleon-at-vanderbilt\/"},{"key":"e_1_3_3_2_15_2","volume-title":"International Conference on Learning Representations (ICLR)","author":"Hu Edward\u00a0J.","year":"2022","unstructured":"Edward\u00a0J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. 2022. LoRA: Low-Rank Adaptation of Large Language Models. In International Conference on Learning Representations (ICLR)."},{"key":"e_1_3_3_2_16_2","doi-asserted-by":"publisher","DOI":"10.1145\/3377814.3381714"},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1109\/e-Science58273.2023.10254846"},{"key":"e_1_3_3_2_18_2","volume-title":"Proceedings of the 2020 USENIX Annual Technical Conference (USENIX ATC \u201920)","author":"Keahey Kate","year":"2020","unstructured":"Kate Keahey, Jason Anderson, Zhuo Zhen, Pierre Riteau, Paul Ruth, Dan Stanzione, Mert Cevik, Jacob Colleran, Haryadi\u00a0S. Gunawi, Cody Hammock, Joe Mambretti, Alexander Barnes, Fran\u00e7ois Halbach, Alex Rocha, and Joe Stubbs. 2020. Lessons Learned from the Chameleon Testbed. In Proceedings of the 2020 USENIX Annual Technical Conference (USENIX ATC \u201920)."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/CLOUD.2019.00019"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"publisher","DOI":"10.1145\/3708035.3736014"},{"key":"e_1_3_3_2_21_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICSE-SEET58685.2023.00015"},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"publisher","unstructured":"Filippo Lanubile Silverio Mart\u00ednez-Fern\u00e1ndez and Luigi Quaranta. 2024. Training Future Machine Learning Engineers: A Project-Based Course on MLOps. IEEE Software 41 2 (2024) 60\u201367. 10.1109\/MS.2023.3310768","DOI":"10.1109\/MS.2023.3310768"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","unstructured":"Shen Li Yanli Zhao Rohan Varma Omkar Salpekar Pieter Noordhuis Teng Li Adam Paszke Jeff Smith Brian Vaughan Pritam Damania and Soumith Chintala. 2020. PyTorch Distributed: Experiences on Accelerating Data Parallel Training. Proc. VLDB Endow. 13 12 (2020) 3005\u20133018. 10.14778\/3415478.3415530","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_3_2_24_2","first-page":"561","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI \u201918)","author":"Moritz Philipp","year":"2018","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael\u00a0I. Jordan, and Ion Stoica. 2018. Ray: A Distributed Framework for Emerging AI Applications. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI \u201918). 561\u2013577."},{"key":"e_1_3_3_2_25_2","volume-title":"51st Euromicro Conference Series on Software Engineering and Advanced Applications (SEAA)","author":"Palomba Fabio","year":"2025","unstructured":"Fabio Palomba, Giuseppe Voria, Antonio Parziale, Vincenzo Pentangelo, Antonio\u00a0Della Porta, Valerio\u00a0De Martino, Giuseppe Recupito, and Gennaro Giordano. 2025. Teaching Software Engineering for Artificial Intelligence: An Experience Report. In 51st Euromicro Conference Series on Software Engineering and Advanced Applications (SEAA). Euromicro, Salerno, Italy. Experience Report."},{"key":"e_1_3_3_2_26_2","volume-title":"Proceedings of the USENIX Conference on Operational Machine Learning (OpML)","author":"Papasian Daniel","year":"2020","unstructured":"Daniel Papasian and Todd Underwood. 2020. How ML Breaks: A Decade of Outages for One Large ML Pipeline. In Proceedings of the USENIX Conference on Operational Machine Learning (OpML)."},{"key":"e_1_3_3_2_27_2","doi-asserted-by":"publisher","unstructured":"Pitch Patarasuk and Xin Yuan. 2009. Bandwidth Optimal All-Reduce Algorithms for Clusters of Workstations. J. Parallel and Distrib. Comput. 69 2 (2009) 117\u2013124. 10.1016\/j.jpdc.2008.09.002","DOI":"10.1016\/j.jpdc.2008.09.002"},{"key":"e_1_3_3_2_28_2","unstructured":"Alexander Ratner Dan Alistarh Gustavo Alonso David\u00a0G. Andersen Peter Bailis Sarah Bird Nicholas Carlini Bryan Catanzaro Jennifer Chayes Eric Chung Bill Dally Jeff Dean Inderjit\u00a0S. Dhillon Alexandros Dimakis Pradeep Dubey Charles Elkan Grigori Fursin Gregory\u00a0R. Ganger Lise Getoor Phillip\u00a0B. Gibbons Garth\u00a0A. Gibson Joseph\u00a0E. Gonzalez Justin Gottschlich Song Han Kim Hazelwood Furong Huang Martin Jaggi Kevin Jamieson Michael\u00a0I. Jordan Gauri Joshi Rania Khalaf Jason Knight Jakub Kone\u010dn\u00fd Tim Kraska Arun Kumar Anastasios Kyrillidis Aparna Lakshmiratan Jing Li Samuel Madden H.\u00a0Brendan McMahan Erik Meijer Ioannis Mitliagkas Rajat Monga Derek Murray Kunle Olukotun Dimitris Papailiopoulos Gennady Pekhimenko Theodoros Rekatsinas Afshin Rostamizadeh Christopher R\u00e9 Christopher\u00a0De Sa Hanie Sedghi Siddhartha Sen Virginia Smith Alex Smola Dawn Song Evan Sparks Ion Stoica Vivienne Sze Madeleine Udell Joaquin Vanschoren Shivaram Venkataraman Rashmi Vinayak Markus Weimer Andrew\u00a0Gordon Wilson Eric Xing Matei Zaharia Ce Zhang and Ameet Talwalkar. 2019. MLSys: The New Frontier of Machine Learning Systems. arxiv:https:\/\/arXiv.org\/abs\/1904.03257\u00a0[cs.LG] https:\/\/arxiv.org\/abs\/1904.03257"},{"key":"e_1_3_3_2_29_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.442"},{"key":"e_1_3_3_2_30_2","volume-title":"Presentation for mini-symposium on education using Chameleon","author":"Rieffel John","year":"2023","unstructured":"John Rieffel. 2023. Presentation for mini-symposium on education using Chameleon. Video available in program listing at: https:\/\/chameleoncloud.org\/chameleon-cloud-users-meeting\/user-meeting-2023\/."},{"key":"e_1_3_3_2_31_2","volume-title":"Practitioners Guide to MLOps: A Framework for Continuous Delivery and Automation of Machine Learning","author":"Salama Khalid","year":"2021","unstructured":"Khalid Salama, Jarek Kazmierczak, and Donna Schut. 2021. Practitioners Guide to MLOps: A Framework for Continuous Delivery and Automation of Machine Learning. Technical Report. Google Cloud. https:\/\/cloud.google.com\/architecture\/mlops-continuous-delivery-and-automation-pipelines-in-machine-learning"},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3491418.3535153"},{"key":"e_1_3_3_2_33_2","first-page":"2503","volume-title":"Advances in Neural Information Processing Systems 28 (NeurIPS)","author":"Sculley D.","year":"2015","unstructured":"D. Sculley, Gary Holt, Daniel Golovin, Eugene Davydov, Todd Phillips, Dietmar Ebner, Vinay Chaudhary, Michael Young, Jean-Francois Crespo, and Dan Dennison. 2015. Hidden Technical Debt in Machine Learning Systems. In Advances in Neural Information Processing Systems 28 (NeurIPS). MIT Press, 2503\u20132511."},{"key":"e_1_3_3_2_34_2","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-22941-1_41"},{"key":"e_1_3_3_2_35_2","volume-title":"Proceedings of the 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI \u201922)","author":"Weng Qizhen","year":"2022","unstructured":"Qizhen Weng, Wencong Xiao, Yinghao Yu, Wei Wang, Cheng Wang, Jian He, Yong Li, Liping Zhang, Wei Lin, and Yu Ding. 2022. MLaaS in the Wild: Workload Analysis and Scheduling in Large-Scale Heterogeneous GPU Clusters. In Proceedings of the 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI \u201922)."},{"key":"e_1_3_3_2_36_2","unstructured":"Matei Zaharia Andrew Chen Aaron Davidson Ali Ghodsi Sue\u00a0Ann Hong Andy Konwinski Siddharth Murching Tomas Nykodym Paul Ogilvie Mani Parkhe Fen Xie and Corey Zumar. 2018. Accelerating the Machine Learning Lifecycle with MLflow. Bulletin of the IEEE Computer Society Technical Committee on Data Engineering (2018)."},{"key":"e_1_3_3_2_37_2","unstructured":"Susan Zhang. 2023. Trials of developing OPT-175B. Stanford MLSys Seminar Episode 77 YouTube. https:\/\/www.youtube.com\/watch?v=p9IxoSkvZ-M"},{"key":"e_1_3_3_2_38_2","doi-asserted-by":"publisher","unstructured":"Yanli Zhao Andrew Gu Rohan Varma Liang Luo Chien-Chin Huang Min Xu Less Wright Hamid Shojanazeri Myle Ott Sam Shleifer Alban Desmaison Can Balioglu Pritam Damania Bernard Nguyen Geeta Chauhan Yuchen Hao Ajit Mathews and Shen Li. 2023. PyTorch FSDP: Experiences on Scaling Fully Sharded Data Parallel. Proc. VLDB Endow. 16 12 (2023) 3848\u20133860. arxiv:https:\/\/arXiv.org\/abs\/2306.1676810.14778\/3611540.3611569","DOI":"10.14778\/3611540.3611569"}],"event":{"name":"SC Workshops '25: Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St Louis MO USA","acronym":"SC Workshops '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the SC '25 Workshops of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3731599.3767385","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767385","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3731599.3767385","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,9]],"date-time":"2026-01-09T19:35:11Z","timestamp":1767987311000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3731599.3767385"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":37,"alternative-id":["10.1145\/3731599.3767385","10.1145\/3731599"],"URL":"https:\/\/doi.org\/10.1145\/3731599.3767385","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}