{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T23:12:12Z","timestamp":1768345932894,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":67,"publisher":"ACM","funder":[{"name":"Alibaba Group and Hong Kong RGC","award":["HKU 17205824 and C7004-22G (CRF)"],"award-info":[{"award-number":["HKU 17205824 and C7004-22G (CRF)"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,19]]},"DOI":"10.1145\/3772052.3772218","type":"proceedings-article","created":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:19:00Z","timestamp":1768321140000},"page":"111-124","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["DyOrc: Efficient Serving of Dynamic Machine Learning Workflows"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0838-1883","authenticated-orcid":false,"given":"Shiwei","family":"Zhang","sequence":"first","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6193-6126","authenticated-orcid":false,"given":"Lansong","family":"Diao","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-3626-6538","authenticated-orcid":false,"given":"Zisheng","family":"Meng","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-4064-6984","authenticated-orcid":false,"given":"Siyu","family":"Wang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3003-0150","authenticated-orcid":false,"given":"Wei","family":"Lin","sequence":"additional","affiliation":[{"name":"Alibaba Group, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3144-4398","authenticated-orcid":false,"given":"Chuan","family":"Wu","sequence":"additional","affiliation":[{"name":"The University of Hong Kong, Hong Kong, Hong Kong"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2026,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"https:\/\/airflow.apache.org\/docs\/apache-airflow\/stable\/project.html. Accessed","author":"Airflow Apache","year":"2023","unstructured":"2014. Apache Airflow. https:\/\/airflow.apache.org\/docs\/apache-airflow\/stable\/project.html. Accessed: 19 Nov. 2023."},{"key":"e_1_3_2_1_2_1","volume-title":"https:\/\/github.com\/google\/flatbuffers. Accessed","year":"2024","unstructured":"2014. FlatBuffers. https:\/\/github.com\/google\/flatbuffers. Accessed: 8 Oct. 2024."},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/airflow.apache.org\/docs\/apache-airflow\/stable\/core-concepts\/xcoms.html. Accessed","year":"2025","unstructured":"2014. XComs. https:\/\/airflow.apache.org\/docs\/apache-airflow\/stable\/core-concepts\/xcoms.html. Accessed: 11 Apr. 2025."},{"key":"e_1_3_2_1_4_1","volume-title":"https:\/\/cadenceworkflow.io\/. Accessed","year":"2024","unstructured":"2017. Cadence. https:\/\/cadenceworkflow.io\/. Accessed: 1 Oct. 2024."},{"key":"e_1_3_2_1_5_1","volume-title":"The Plasma In-Memory Object Store. https:\/\/ray-project.github.io\/2017\/08\/08\/plasma-in-memory-object-store.html. Accessed","year":"2025","unstructured":"2017. The Plasma In-Memory Object Store. https:\/\/ray-project.github.io\/2017\/08\/08\/plasma-in-memory-object-store.html. Accessed: 11 Apr. 2025."},{"key":"e_1_3_2_1_6_1","volume-title":"https:\/\/dagster.io\/. Accessed","year":"2024","unstructured":"2018. Dagster. https:\/\/dagster.io\/. Accessed: 1 Oct. 2024."},{"key":"e_1_3_2_1_7_1","volume-title":"Triton Inference Server. https:\/\/github.com\/triton-inference-server\/server. Accessed","year":"2025","unstructured":"2018. Triton Inference Server. https:\/\/github.com\/triton-inference-server\/server. Accessed: 11 Apr. 2025."},{"key":"e_1_3_2_1_8_1","volume-title":"https:\/\/docs.ray.io\/en\/latest\/serve\/index.html. Accessed","author":"Serve Ray","year":"2023","unstructured":"2019. Ray Serve. https:\/\/docs.ray.io\/en\/latest\/serve\/index.html. Accessed: 19 Nov. 2023."},{"key":"e_1_3_2_1_9_1","volume-title":"https:\/\/github.com\/NVIDIA\/TensorRT. Accessed","author":"RT.","year":"2024","unstructured":"2019. TensorRT. https:\/\/github.com\/NVIDIA\/TensorRT. Accessed: 7 Oct. 2024."},{"key":"e_1_3_2_1_10_1","volume-title":"https:\/\/github.com\/kserve\/kserve. Accessed","year":"2024","unstructured":"2021. KServe. https:\/\/github.com\/kserve\/kserve. Accessed: 7 Oct. 2024."},{"key":"e_1_3_2_1_11_1","volume-title":"https:\/\/github.com\/pytorch\/pytorch\/issues\/64932. Accessed","year":"2024","unstructured":"2021. TorchStore. https:\/\/github.com\/pytorch\/pytorch\/issues\/64932. Accessed: 8 Oct. 2024."},{"key":"e_1_3_2_1_12_1","volume-title":"https:\/\/github.com\/comfyanonymous\/ComfyUI\/. Accessed","author":"UI.","year":"2024","unstructured":"2024. ComfyUI. https:\/\/github.com\/comfyanonymous\/ComfyUI\/. Accessed: 1 Dec. 2024."},{"key":"e_1_3_2_1_13_1","volume-title":"https:\/\/www.coze.com\/. Accessed","year":"2024","unstructured":"2024. Coze. https:\/\/www.coze.com\/. Accessed: 18 Oct. 2024."},{"key":"e_1_3_2_1_14_1","volume-title":"https:\/\/github.com\/langflow-ai\/langflow\/. Accessed","year":"2024","unstructured":"2024. Langflow. https:\/\/github.com\/langflow-ai\/langflow\/. Accessed: 1 Dec. 2024."},{"key":"e_1_3_2_1_15_1","volume-title":"https:\/\/github.com\/Netflix\/maestro\/. Accessed","year":"2024","unstructured":"2024. Maestro. https:\/\/github.com\/Netflix\/maestro\/. Accessed: 1 Oct. 2024."},{"key":"e_1_3_2_1_16_1","volume-title":"tarpc. https:\/\/github.com\/google\/tarpc. Accessed","year":"2024","unstructured":"2024. tarpc. https:\/\/github.com\/google\/tarpc. Accessed: 7 Dec. 2024."},{"key":"e_1_3_2_1_17_1","volume-title":"Tensorflow: A system for large-scale machine learning. In 12th { USENIX} symposium on operating systems design and implementation ({ OSDI} 16).","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et al. 2016. Tensorflow: A system for large-scale machine learning. In 12th { USENIX} symposium on operating systems design and implementation ({ OSDI} 16)."},{"key":"e_1_3_2_1_18_1","volume-title":"Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245","author":"Ainslie Joshua","year":"2023","unstructured":"Joshua Ainslie, James Lee-Thorp, Michiel de Jong, Yury Zemlyanskiy, Federico Lebr\u00f3n, and Sumit Sanghai. 2023. Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv preprint arXiv:2305.13245 (2023)."},{"key":"e_1_3_2_1_19_1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201315","author":"Ali Ahsan","year":"2020","unstructured":"Ahsan Ali, Riccardo Pinciroli, Feng Yan, and Evgenia Smirni. 2020. Batch: Machine learning inference serving on serverless platforms with adaptive batching. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1\u201315."},{"key":"e_1_3_2_1_20_1","volume-title":"List scheduling algorithm for heterogeneous systems by an optimistic cost table","author":"Arabnejad Hamid","year":"2013","unstructured":"Hamid Arabnejad and Jorge G Barbosa. 2013. List scheduling algorithm for heterogeneous systems by an optimistic cost table. IEEE transactions on parallel and distributed systems 25, 3 (2013), 682\u2013694."},{"key":"e_1_3_2_1_21_1","volume-title":"Reducing Transformer Key-Value Cache Size with Cross-Layer Attention. arXiv preprint arXiv:2405.12981","author":"Brandon William","year":"2024","unstructured":"William Brandon, Mayank Mishra, Aniruddha Nrusimha, Rameswar Panda, and Jonathan Ragan Kelly. 2024. Reducing Transformer Key-Value Cache Size with Cross-Layer Attention. arXiv preprint arXiv:2405.12981 (2024)."},{"key":"e_1_3_2_1_22_1","unstructured":"Tom B Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell et al. 2020. Language models are few-shot learners. arXiv preprint arXiv:2005.14165 (2020)."},{"key":"e_1_3_2_1_23_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. {TVM}: An automated {End-to-End} optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_2_1_24_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Choi Seungbeom","year":"2022","unstructured":"Seungbeom Choi, Sunho Lee, Yeonjae Kim, Jongse Park, Youngjin Kwon, and Jaehyuk Huh. 2022. Serving heterogeneous machine learning models on {Multi-GPU} servers with {Spatio-Temporal} sharing. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 199\u2013216."},{"key":"e_1_3_2_1_25_1","volume-title":"Apparate: Rethinking Early Exits to Tame Latency-Throughput Tensions in ML Serving. arXiv preprint arXiv:2312.05385","author":"Dai Yinwei","year":"2023","unstructured":"Yinwei Dai, Rui Pan, Anand Iyer, Kai Li, and Ravi Netravali. 2023. Apparate: Rethinking Early Exits to Tame Latency-Throughput Tensions in ML Serving. arXiv preprint arXiv:2312.05385 (2023)."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.121"},{"key":"e_1_3_2_1_27_1","volume-title":"Introducing pathways: A next-generation ai architecture. Google Blog 1","author":"Dean Jeff","year":"2021","unstructured":"Jeff Dean. 2021. Introducing pathways: A next-generation ai architecture. Google Blog 1 (2021)."},{"key":"e_1_3_2_1_28_1","unstructured":"Alexey Dosovitskiy Lucas Beyer Alexander Kolesnikov Dirk Weissenborn Xiaohua Zhai Thomas Unterthiner Mostafa Dehghani Matthias Minderer Georg Heigold Sylvain Gelly et al. 2020. An image is worth 16x16 words: Transformers for image recognition at scale. arXiv preprint arXiv:2010.11929 (2020)."},{"key":"e_1_3_2_1_29_1","volume-title":"Forty-first International Conference on Machine Learning.","author":"Du Yilun","year":"2023","unstructured":"Yilun Du, Shuang Li, Antonio Torralba, Joshua B Tenenbaum, and Igor Mordatch. 2023. Improving factuality and reasoning in language models through multiagent debate. In Forty-first International Conference on Machine Learning."},{"key":"e_1_3_2_1_30_1","volume-title":"Markov processes","author":"Dynkin E.B.","unstructured":"E.B. Dynkin. 1965. Markov processes. Springer."},{"key":"e_1_3_2_1_31_1","first-page":"14","article-title":"ACROBAT: Optimizing Auto-batching of Dynamic Deep Learning at Compile Time","volume":"6","author":"Fegade Pratik","year":"2024","unstructured":"Pratik Fegade, Tianqi Chen, Phillip Gibbons, and Todd Mowry. 2024. ACROBAT: Optimizing Auto-batching of Dynamic Deep Learning at Compile Time. Proceedings of Machine Learning and Systems 6 (2024), 14\u201330.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_32_1","volume-title":"Knowledge Card: Filling LLMs' Knowledge Gaps with Plug-in Specialized Language Models. arXiv preprint arXiv:2305.09955","author":"Feng Shangbin","year":"2023","unstructured":"Shangbin Feng, Weijia Shi, Yuyang Bai, Vidhisha Balachandran, Tianxing He, and Yulia Tsvetkov. 2023. Knowledge Card: Filling LLMs' Knowledge Gaps with Plug-in Specialized Language Models. arXiv preprint arXiv:2305.09955 (2023)."},{"key":"e_1_3_2_1_33_1","volume-title":"Patterns of enterprise application architecture","author":"Fowler Martin","unstructured":"Martin Fowler. 2012. Patterns of enterprise application architecture. Addison-Wesley."},{"key":"e_1_3_2_1_34_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI24)","author":"Fu Yao","year":"2024","unstructured":"Yao Fu, Leyang Xue, Yeqi Huang, Andrei-Octavian Brabete, Dmitrii Ustiugov, Yuvraj Patel, and Luo Mai. 2024. {ServerlessLLM}:{Low-Latency} Serverless Inference for Large Language Models. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI24). 135\u2013153."},{"key":"e_1_3_2_1_35_1","volume-title":"Planting a seed of vision in large language model. arXiv preprint arXiv:2307.08041","author":"Ge Yuying","year":"2023","unstructured":"Yuying Ge, Yixiao Ge, Ziyun Zeng, Xintao Wang, and Ying Shan. 2023. Planting a seed of vision in large language model. arXiv preprint arXiv:2307.08041 (2023)."},{"key":"e_1_3_2_1_36_1","volume-title":"Making llama see and draw with seed tokenizer. arXiv preprint arXiv:2310.01218","author":"Ge Yuying","year":"2023","unstructured":"Yuying Ge, Sijie Zhao, Ziyun Zeng, Yixiao Ge, Chen Li, Xintao Wang, and Ying Shan. 2023. Making llama see and draw with seed tokenizer. arXiv preprint arXiv:2310.01218 (2023)."},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Document Analysis and Recognition. Springer, 270\u2013286","author":"Hamed Omar","year":"2024","unstructured":"Omar Hamed, Souhail Bakkali, Matthew Blaschko, Sien Moens, and Jordy Van Landeghem. 2024. Multimodal adaptive inference for document image classification with anytime early exiting. In International Conference on Document Analysis and Recognition. Springer, 270\u2013286."},{"key":"e_1_3_2_1_38_1","volume-title":"Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300","author":"Hendrycks Dan","year":"2020","unstructured":"Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. 2020. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300 (2020)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.00799"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3625549.3658657"},{"key":"e_1_3_2_1_41_1","volume-title":"GPU Technology Conference (GTC)","volume":"2","author":"Jeaugey Sylvain","year":"2017","unstructured":"Sylvain Jeaugey. 2017. Nccl 2.0. In GPU Technology Conference (GTC), Vol. 2."},{"key":"e_1_3_2_1_42_1","volume-title":"Generating images with multimodal language models. arXiv preprint arXiv:2305.17216","author":"Koh Jing Yu","year":"2023","unstructured":"Jing Yu Koh, Daniel Fried, and Ruslan Salakhutdinov. 2023. Generating images with multimodal language models. arXiv preprint arXiv:2305.17216 (2023)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_44_1","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. 2023. {AlpaServe}: Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663\u2013679."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3318464.3386126"},{"key":"e_1_3_2_1_46_1","volume-title":"Boosting Multimodal Large Language Models with Visual Tokens Withdrawal for Rapid Inference. arXiv preprint arXiv:2405.05803","author":"Lin Zhihang","year":"2024","unstructured":"Zhihang Lin, Mingbao Lin, Luxi Lin, and Rongrong Ji. 2024. Boosting Multimodal Large Language Models with Visual Tokens Withdrawal for Rapid Inference. arXiv preprint arXiv:2405.05803 (2024)."},{"key":"e_1_3_2_1_47_1","volume-title":"Visual instruction tuning. Advances in neural information processing systems 36","author":"Liu Haotian","year":"2024","unstructured":"Haotian Liu, Chunyuan Li, Qingyang Wu, and Yong Jae Lee. 2024. Visual instruction tuning. Advances in neural information processing systems 36 (2024)."},{"key":"e_1_3_2_1_48_1","volume-title":"Pack of LLMs: Model Fusion at Test-Time via Perplexity Optimization. arXiv preprint arXiv:2404.11531","author":"Mavromatis Costas","year":"2024","unstructured":"Costas Mavromatis, Petros Karypis, and George Karypis. 2024. Pack of LLMs: Model Fusion at Test-Time via Perplexity Optimization. arXiv preprint arXiv:2404.11531 (2024)."},{"key":"e_1_3_2_1_49_1","volume-title":"On-the-fly operation batching in dynamic computation graphs. Advances in Neural Information Processing Systems 30","author":"Neubig Graham","year":"2017","unstructured":"Graham Neubig, Yoav Goldberg, and Chris Dyer. 2017. On-the-fly operation batching in dynamic computation graphs. Advances in Neural Information Processing Systems 30 (2017)."},{"key":"e_1_3_2_1_50_1","volume-title":"Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139","author":"Olston Christopher","year":"2017","unstructured":"Christopher Olston, Noah Fiedel, Kiril Gorovoy, Jeremiah Harmsen, Li Lao, Fangwei Li, Vinu Rajashekhar, Sukriti Ramesh, and Jordan Soyke. 2017. Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139 (2017)."},{"key":"e_1_3_2_1_51_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019), 8026\u20138037."},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of the 2023 ACM Symposium on Cloud Computing. 324\u2013340","author":"Pei Qiangyu","year":"2023","unstructured":"Qiangyu Pei, Yongjie Yuan, Haichuan Hu, Qiong Chen, and Fangming Liu. 2023. AsyFunc: A High-Performance and Resource-Efficient Serverless Inference System via Asymmetric Functions. In Proceedings of the 2023 ACM Symposium on Cloud Computing. 324\u2013340."},{"key":"e_1_3_2_1_53_1","volume-title":"Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 1, 2","author":"Ramesh Aditya","year":"2022","unstructured":"Aditya Ramesh, Prafulla Dhariwal, Alex Nichol, Casey Chu, and Mark Chen. 2022. Hierarchical text-conditional image generation with clip latents. arXiv preprint arXiv:2204.06125 1, 2 (2022), 3."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_55_1","unstructured":"Mohammad Shahrad Rodrigo Fonseca Inigo Goiri Gohar Chaudhry Paul Batum Jason Cooke Eduardo Laureano Colby Tresness Mark Russinovich and Ricardo Bianchini. 2020. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC 20). 205\u2013218."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52733.2024.01365"},{"key":"e_1_3_2_1_57_1","volume-title":"Teola: Towards end-to-end optimization of llm-based applications. arXiv preprint arXiv:2407.00326","author":"Tan Xin","year":"2024","unstructured":"Xin Tan, Yimin Jiang, Yitao Yang, and Hong Xu. 2024. Teola: Towards end-to-end optimization of llm-based applications. arXiv preprint arXiv:2407.00326 (2024)."},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.993206"},{"key":"e_1_3_2_1_59_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_60_1","volume-title":"Look-m: Look-once optimization in kv cache for efficient multimodal long-context inference. arXiv preprint arXiv:2406.18139","author":"Wan Zhongwei","year":"2024","unstructured":"Zhongwei Wan, Ziang Wu, Che Liu, Jinfa Huang, Zhihong Zhu, Peng Jin, Longyue Wang, and Li Yuan. 2024. Look-m: Look-once optimization in kv cache for efficient multimodal long-context inference. arXiv preprint arXiv:2406.18139 (2024)."},{"key":"e_1_3_2_1_61_1","volume-title":"Transformers: State-of-the-Art Natural Language Processing. arXiv preprint arXiv:1910.03771","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf. 2020. Transformers: State-of-the-Art Natural Language Processing. arXiv preprint arXiv:1910.03771 (2020)."},{"key":"e_1_3_2_1_62_1","unstructured":"Xiangyuan Xue Zeyu Lu Di Huang Zidong Wang Wanli Ouyang and Lei Bai. 2024. ComfyBench: Benchmarking LLM-based Agents in ComfyUI for Autonomously Designing Collaborative AI Systems. arXiv:2409.01392 [cs.CL] https:\/\/arxiv.org\/abs\/2409.01392"},{"key":"e_1_3_2_1_63_1","volume-title":"A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549","author":"Yin Shukang","year":"2023","unstructured":"Shukang Yin, Chaoyou Fu, Sirui Zhao, Ke Li, Xing Sun, Tong Xu, and Enhong Chen. 2023. A Survey on Multimodal Large Language Models. arXiv preprint arXiv:2306.13549 (2023)."},{"key":"e_1_3_2_1_64_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_2_1_65_1","volume-title":"2021 IEEE 41st International Conference on Distributed Computing Systems (ICDCS). IEEE, 138\u2013148","author":"Yu Minchen","year":"2021","unstructured":"Minchen Yu, Zhifeng Jiang, Hok Chun Ng, Wei Wang, Ruichuan Chen, and Bo Li. 2021. Gillis: Serving large neural networks in serverless functions with automatic model partitioning. In 2021 IEEE 41st International Conference on Distributed Computing Systems (ICDCS). IEEE, 138\u2013148."},{"key":"e_1_3_2_1_66_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. 2019. { MArk}: Exploiting cloud services for {Cost-Effective},{SLO-Aware} machine learning inference serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). 1049\u20131062."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3567955.3567960"}],"event":{"name":"SoCC '25: ACM Symposium on Cloud Computing","location":"Online USA","acronym":"SoCC '25","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","SIGMOD ACM Special Interest Group on Management of Data"]},"container-title":["Proceedings of the 2025 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3772052.3772218","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,1,13]],"date-time":"2026-01-13T16:23:20Z","timestamp":1768321400000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3772052.3772218"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,19]]},"references-count":67,"alternative-id":["10.1145\/3772052.3772218","10.1145\/3772052"],"URL":"https:\/\/doi.org\/10.1145\/3772052.3772218","relation":{},"subject":[],"published":{"date-parts":[[2025,11,19]]},"assertion":[{"value":"2026-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}