{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T05:41:46Z","timestamp":1775281306512,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":43,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,5,14]],"date-time":"2025-05-14T00:00:00Z","timestamp":1747180800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,5,14]]},"DOI":"10.1145\/3713082.3730377","type":"proceedings-article","created":{"date-parts":[[2025,6,6]],"date-time":"2025-06-06T09:53:51Z","timestamp":1749203631000},"page":"218-224","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Towards Resource-Efficient Compound AI Systems"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1025-0009","authenticated-orcid":false,"given":"Gohar Irfan","family":"Chaudhry","sequence":"first","affiliation":[{"name":"MIT CSAIL"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0371-5522","authenticated-orcid":false,"given":"Esha","family":"Choukse","sequence":"additional","affiliation":[{"name":"Microsoft Azure Research -- Systems"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2591-4012","authenticated-orcid":false,"given":"\u00cd\u00f1igo","family":"Goiri","sequence":"additional","affiliation":[{"name":"Microsoft Azure Research -- Systems"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9662-2661","authenticated-orcid":false,"given":"Rodrigo","family":"Fonseca","sequence":"additional","affiliation":[{"name":"Microsoft Azure Research -- Systems"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2222-3611","authenticated-orcid":false,"given":"Adam","family":"Belay","sequence":"additional","affiliation":[{"name":"MIT CSAIL"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5971-5084","authenticated-orcid":false,"given":"Ricardo","family":"Bianchini","sequence":"additional","affiliation":[{"name":"Microsoft Azure"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2025,6,6]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Amazon Web Services. 2025. Amazon Web Services (AWS). https:\/\/aws.amazon.com\/."},{"key":"e_1_3_2_1_2_1","unstructured":"Pradeep Ambati \u00cd\u00f1igo Goiri Felipe Frujeri Alper Gun Ke Wang Brian Dolan Brian Corell Sekhar Pasupuleti Thomas Moscibroda Sameh Elnikety et al. 2020. Providing SLOs for Resource-Harvesting VMs in cloud platforms. In OSDI."},{"key":"e_1_3_2_1_3_1","volume-title":"Bohou Li, Mark Lindblad, Henry Lindeman, Alex Meyer, Parth Parmar, Tanvi Ranade, Mehul A. Shah, Benjamin Sowell, Dan Tecuci, Vinayak Thapliyal, and Matt Welsh.","author":"Anderson Eric","year":"2024","unstructured":"Eric Anderson, Jonathan Fritz, Austin Lee, Bohou Li, Mark Lindblad, Henry Lindeman, Alex Meyer, Parth Parmar, Tanvi Ranade, Mehul A. Shah, Benjamin Sowell, Dan Tecuci, Vinayak Thapliyal, and Matt Welsh. 2024. The Design of an LLM-powered Unstructured Analytics System. arXiv:2409.00847 [cs.DB] https:\/\/arxiv.org\/abs\/2409.00847"},{"key":"e_1_3_2_1_4_1","unstructured":"The Kubernetes Authors. 2024. Kubernetes Documentation. https:\/\/kubernetes.io\/"},{"key":"e_1_3_2_1_5_1","unstructured":"Microsoft Azure. 2024. Azure Machine Learning. https:\/\/azure.microsoft.com\/en-us\/products\/machine-learning"},{"key":"e_1_3_2_1_6_1","unstructured":"Microsoft Azure. 2024. What is provisioned throughput? https:\/\/learn.microsoft.com\/en-us\/azure\/ai-services\/openai\/concepts\/provisioned-throughput"},{"key":"e_1_3_2_1_7_1","unstructured":"Microsoft Azure. 2024. What is provisioned throughput? https:\/\/azure.microsoft.com\/en-us\/products\/virtual-machines\/spot\/"},{"key":"e_1_3_2_1_8_1","unstructured":"Microsoft Azure. 2025. GPU Accelerated Virtual Machines: NDm A100 v4-Series. https:\/\/learn.microsoft.com\/en-us\/azure\/virtual-machines\/sizes\/gpu-accelerated\/ndma100v4-series."},{"key":"e_1_3_2_1_9_1","volume-title":"The OpenCV Library. Dr. Dobb's Journal of Software Tools","author":"Bradski G.","year":"2000","unstructured":"G. Bradski. 2000. The OpenCV Library. Dr. Dobb's Journal of Software Tools (2000)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/275487.275492"},{"key":"e_1_3_2_1_11_1","unstructured":"Lingjiao Chen Matei Zaharia and James Zou. 2023. FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance. arXiv:2305.05176 [cs.LG] https:\/\/arxiv.org\/abs\/2305.05176"},{"key":"e_1_3_2_1_12_1","unstructured":"NVIDIA Corporation. 2022. NVIDIA H100 Tensor Core GPU Datasheet. https:\/\/resources.nvidia.com\/en-us-tensor-core\/nvidia-tensor-core-gpu-datasheet"},{"key":"e_1_3_2_1_13_1","volume-title":"NVLM: Open Frontier-Class Multimodal LLMs. arXiv:2409.11402 https:\/\/arxiv.org\/abs\/2409.11402","author":"Dai Wenliang","year":"2024","unstructured":"Wenliang Dai, Nayeon Lee, Boxin Wang, Zhuolin Yang, Zihan Liu, Jon Barker, Tuomas Rintamaki, Mohammad Shoeybi, Bryan Catanzaro, and Wei Ping. 2024. NVLM: Open Frontier-Class Multimodal LLMs. arXiv:2409.11402 https:\/\/arxiv.org\/abs\/2409.11402"},{"key":"e_1_3_2_1_14_1","unstructured":"Databricks. 2025. Databricks Large Language Model Serving. https:\/\/docs.databricks.com\/en\/large-language-models\/index.html."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451125"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541941"},{"key":"e_1_3_2_1_17_1","unstructured":"Google Cloud. 2025. Google Cloud Platform. https:\/\/cloud.google.com\/."},{"key":"e_1_3_2_1_18_1","unstructured":"Aaron Grattafiori Abhimanyu Dubey Abhinav Jauhri Abhinav Pandey Abhishek Kadian Ahmad Al-Dahle Aiesha Letman Akhil Mathur et al. 2024. The Llama 3 Herd of Models. arXiv:2407.21783 [cs.CL] https:\/\/arxiv.org\/abs\/2407.21783"},{"key":"e_1_3_2_1_19_1","unstructured":"Aaron Hurst Adam Lerer Adam P. Goucher Adam Perelman Aditya Ramesh et al. 2024. GPT-4o System Card. arXiv:2410.21276 [cs.CL] https:\/\/arxiv.org\/abs\/2410.21276"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/356924.356928"},{"key":"e_1_3_2_1_21_1","unstructured":"Eric Jonas Johann Schleier-Smith Vikram Sreekanti Chia-Che Tsai Anurag Khandelwal Qifan Pu Vaishaal Shankar Joao Carreira Karl Krauth Neeraja Yadwadkar et al. 2019. Cloud programming simplified: A berkeley view on serverless computing. arXiv preprint arXiv:1902.03383 (2019)."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"Norman P. Jouppi George Kurian Sheng Li Peter Ma Rahul Nagarajan Lifeng Nai Nishant Patil Suvinay Subramanian Andy Swing Brian Towles Cliff Young Xiang Zhou Zongwei Zhou and David Patterson. 2023. TPU v4: An Optically Reconfigurable Supercomputer for Machine Learning with Hardware Support for Embeddings. arXiv:2304.01433 [cs.AR] https:\/\/arxiv.org\/abs\/2304.01433","DOI":"10.1145\/3579371.3589350"},{"key":"e_1_3_2_1_23_1","unstructured":"LangChain. 2024. LangChain. https:\/\/github.com\/langchain-ai\/langchain."},{"key":"e_1_3_2_1_24_1","volume-title":"Zui Chen, Michael Franklin, Tim Kraska, Samuel Madden, and Gerardo Vitagliano.","author":"Liu Chunwei","year":"2024","unstructured":"Chunwei Liu, Matthew Russo, Michael Cafarella, Lei Cao, Peter Baille Chen, Zui Chen, Michael Franklin, Tim Kraska, Samuel Madden, and Gerardo Vitagliano. 2024. A Declarative System for Optimizing AI Workloads. arXiv:2405.14696 [cs.CL]"},{"key":"e_1_3_2_1_25_1","unstructured":"Jerry Liu. 2022. LlamaIndex. https:\/\/github.com\/jerryjliu\/llama_index."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.14778\/3685800.3685916"},{"key":"e_1_3_2_1_27_1","unstructured":"Microsoft. 2025. Microsoft Azure. https:\/\/azure.microsoft.com\/."},{"key":"e_1_3_2_1_28_1","unstructured":"NVIDIA. 2024. NVIDIA A100 Tensor Core GPU Datasheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a100\/pdf\/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf"},{"key":"e_1_3_2_1_29_1","unstructured":"OpenAI. 2023. Function Calling Guide. https:\/\/platform.openai.com\/docs\/guides\/function-calling"},{"key":"e_1_3_2_1_30_1","unstructured":"OpenAI. 2023. OpenAI API Reference. https:\/\/platform.openai.com\/docs\/api-reference\/introduction"},{"key":"e_1_3_2_1_31_1","unstructured":"OpenAI. 2025. OpenAI Large Language Models and API. https:\/\/platform.openai.com\/docs\/."},{"key":"e_1_3_2_1_32_1","volume-title":"Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Saurabh Sastry, Amanda Askell, Pam Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever.","author":"Radford Alec","year":"2021","unstructured":"Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Saurabh Sastry, Amanda Askell, Pam Mishkin, Jack Clark, Gretchen Krueger, and Ilya Sutskever. 2021. Learning Transferable Visual Models From Natural Language Supervision. https:\/\/openai.com\/research\/clip. OpenAI."},{"key":"e_1_3_2_1_33_1","volume-title":"Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever.","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. 2022. Robust Speech Recognition via Large-Scale Weak Supervision. https:\/\/openai.com\/research\/whisper. OpenAI."},{"key":"e_1_3_2_1_34_1","volume-title":"Samuel Kriman, Somshubra Majumdar, Vahid Noroozi, He Huang, Oleksii Hrinchuk, Krishna Puvvada, Ankur Kumar, Jagadeesh Balam, and Boris Ginsburg.","author":"Rekesh Dima","year":"2023","unstructured":"Dima Rekesh, Nithin Rao Koluguri, Samuel Kriman, Somshubra Majumdar, Vahid Noroozi, He Huang, Oleksii Hrinchuk, Krishna Puvvada, Ankur Kumar, Jagadeesh Balam, and Boris Ginsburg. 2023. Fast Conformer with Linearly Scalable Attention for Efficient Speech Recognition. arXiv:2305.05084 [eess.AS] https:\/\/arxiv.org\/abs\/2305.05084"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3243176.3243183"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3593856.3595893"},{"key":"e_1_3_2_1_37_1","unstructured":"Amazon Web Services. 2024. Amazon SageMaker: Build Train and Deploy Machine Learning Models at Scale. https:\/\/aws.amazon.com\/sagemaker\/"},{"key":"e_1_3_2_1_38_1","unstructured":"Stanford NLP Group. 2023. DSPy: The Framework for Programming---Not Prompting---Language Models. https:\/\/github.com\/stanfordnlp\/dspy."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","unstructured":"Ion Stoica and Scott Shenker. 2021. From Cloud Computing to Sky Computing. In HotOS. https:\/\/doi.org\/10.1145\/3458336.3465301","DOI":"10.1145\/3458336.3465301"},{"key":"e_1_3_2_1_40_1","volume-title":"Chi, Quoc Le, and Denny Zhou","author":"Wei Jason","year":"2023","unstructured":"Jason Wei, Xuezhi Wang, Dale Schuurmans, Maarten Bosma, Brian Ichter, Fei Xia, Ed Chi, Quoc Le, and Denny Zhou. 2023. Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. arXiv:2201.11903 [cs.CL] https:\/\/arxiv.org\/abs\/2201.11903"},{"key":"e_1_3_2_1_41_1","unstructured":"Shunyu Yao Jeffrey Zhao Dian Yu Nan Du Izhak Shafran Karthik Narasimhan and Yuan Cao. 2023. ReAct: Synergizing Reasoning and Acting in Language Models. arXiv:2210.03629 [cs.CL] https:\/\/arxiv.org\/abs\/2210.03629"},{"key":"e_1_3_2_1_42_1","volume-title":"Heather Miller, Chris Potts, James Zou, Michael Carbin, Jonathan Frankle, Naveen Rao, and Ali Ghodsi.","author":"Zaharia Matei","year":"2024","unstructured":"Matei Zaharia, Omar Khattab, Lingjiao Chen, Jared Quincy Davis, Heather Miller, Chris Potts, James Zou, Michael Carbin, Jonathan Frankle, Naveen Rao, and Ali Ghodsi. 2024. The Shift from Models to Compound AI Systems. https:\/\/bair.berkeley.edu\/blog\/2024\/02\/18\/compound-ai-systems\/."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Lu Zhang Tiancheng Zhao Heting Ying Yibo Ma and Kyusong Lee. 2024. OmAgent: A Multi-modal Agent Framework for Complex Video Understanding with Task Divide-and-Conquer. arXiv:2406.16620 [cs.CL] https:\/\/arxiv.org\/abs\/2406.16620","DOI":"10.18653\/v1\/2024.emnlp-main.559"}],"event":{"name":"HOTOS '25: Workshop on Hot Topics in Operating Systems","location":"Banff AB Canada","acronym":"HOTOS '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the Workshop on Hot Topics in Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3713082.3730377","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3713082.3730377","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T16:48:40Z","timestamp":1756486120000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3713082.3730377"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,14]]},"references-count":43,"alternative-id":["10.1145\/3713082.3730377","10.1145\/3713082"],"URL":"https:\/\/doi.org\/10.1145\/3713082.3730377","relation":{},"subject":[],"published":{"date-parts":[[2025,5,14]]},"assertion":[{"value":"2025-06-06","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}