{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T21:31:39Z","timestamp":1765229499028,"version":"3.46.0"},"publisher-location":"New York, NY, USA","reference-count":88,"publisher":"ACM","content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,12,15]]},"DOI":"10.1145\/3721462.3770765","type":"proceedings-article","created":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T19:56:49Z","timestamp":1765223809000},"page":"45-58","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Argus: Quality-Aware High-Throughput Text-to-Image Inference Serving System"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-3290-6328","authenticated-orcid":false,"given":"Shubham","family":"Agarwal","sequence":"first","affiliation":[{"name":"UC Berkeley, Berkeley, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8436-3119","authenticated-orcid":false,"given":"Subrata","family":"Mitra","sequence":"additional","affiliation":[{"name":"Adobe Research, Bangalore, India"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-8581-2532","authenticated-orcid":false,"given":"Saud","family":"Iqbal","sequence":"additional","affiliation":[{"name":"Independent Researcher, Bangalore, India"}]}],"member":"320","published-online":{"date-parts":[[2025,12,14]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"A100 core: Powering high-performance computing platform | nvidia. https:\/\/www.nvidia.com\/en-in\/data-center\/a100\/. (Accessed on 05\/22\/2024)."},{"key":"e_1_3_2_1_2_1","unstructured":"Accelerate. https:\/\/huggingface.co\/docs\/accelerate\/v0.11.0\/en\/index. (Accessed on 05\/22\/2024)."},{"key":"e_1_3_2_1_3_1","unstructured":"archiveteam-twitter-stream-2018-10 directory listing. https:\/\/archive.org\/download\/archiveteam-twitter-stream-2018-10. (Accessed on 05\/22\/2024)."},{"key":"e_1_3_2_1_4_1","unstructured":"Complete guide to samplers in stable diffusion - f\u00e9lix sanz. https:\/\/www.felixsanz.dev\/articles\/complete-guide-to-samplers-in-stable-diffusion. (Accessed on 09\/21\/2023)."},{"key":"e_1_3_2_1_5_1","unstructured":"Hugging face - the ai community building the future. https:\/\/huggingface.co\/. (Accessed on 01\/17\/2024)."},{"key":"e_1_3_2_1_6_1","unstructured":"Models - Hugging Face \u2014 huggingface.co. https:\/\/huggingface.co\/models?pipeline_tag=text-to-image. [Accessed 03-06-2025]."},{"key":"e_1_3_2_1_7_1","unstructured":"New - ec2 instances (g5) with nvidia a10g tensor core gpus | aws news blog. https:\/\/aws.amazon.com\/blogs\/aws\/new-ec2-instances-g5-with-nvidia-a10g-tensor-core-gpus\/. (Accessed on 05\/21\/2024)."},{"key":"e_1_3_2_1_8_1","unstructured":"Stable diffusion public release \u2014 stability ai. https:\/\/stability.ai\/blog\/stable-diffusion-public-release. (Accessed on 09\/20\/2023)."},{"key":"e_1_3_2_1_9_1","unstructured":"Text-to-image ai | google cloud. https:\/\/cloud.google.com\/use-cases\/text-to-image-ai?hl=en. (Accessed on 05\/22\/2024)."},{"key":"e_1_3_2_1_10_1","volume-title":"https:\/\/www.adobe.com\/sensei\/generative-ai\/firefly.html","author":"Adobe","year":"2023","unstructured":"Adobe firefly. https:\/\/www.adobe.com\/sensei\/generative-ai\/firefly.html. 2023."},{"key":"e_1_3_2_1_11_1","volume-title":"https:\/\/azure.microsoft.com\/en-us\/products\/machine-learning","author":"Azure","year":"2023","unstructured":"Azure machine learning. https:\/\/azure.microsoft.com\/en-us\/products\/machine-learning. 2023."},{"key":"e_1_3_2_1_12_1","volume-title":"https:\/\/openai.com\/dall-e-2","year":"2023","unstructured":"Dall-e 2. https:\/\/openai.com\/dall-e-2. 2023."},{"key":"e_1_3_2_1_13_1","volume-title":"3x the speed of stable diffusion with the same quality. https:\/\/deci.ai\/blog\/decidiffusion-1-0-3x-faster-than-stable-diffusion-same-quality\/","author":"Introducing","year":"2023","unstructured":"Introducing decidiffusion 1.0: : 3x the speed of stable diffusion with the same quality. https:\/\/deci.ai\/blog\/decidiffusion-1-0-3x-faster-than-stable-diffusion-same-quality\/. 2023."},{"key":"e_1_3_2_1_14_1","volume-title":"https:\/\/www.midjourney.com\/home\/","year":"2023","unstructured":"Midjourney. https:\/\/www.midjourney.com\/home\/. 2023."},{"key":"e_1_3_2_1_15_1","volume-title":"https:\/\/qdrant.tech\/","author":"Qdrant","year":"2023","unstructured":"Qdrant - vector database. https:\/\/qdrant.tech\/, 2023."},{"key":"e_1_3_2_1_16_1","volume-title":"build, train, and deploy machine learning models at scale. https:\/\/aws.amazon.com\/sagemaker\/","author":"Amazon","year":"2024","unstructured":"Amazon sagemaker. build, train, and deploy machine learning models at scale. https:\/\/aws.amazon.com\/sagemaker\/, 2024."},{"key":"e_1_3_2_1_17_1","volume-title":"https:\/\/developer.nvidia.com\/tensorrt","author":"Nvidia","year":"2024","unstructured":"Nvidia tensorrt:programmable inference accelerator. https:\/\/developer.nvidia.com\/tensorrt, 2024."},{"key":"e_1_3_2_1_18_1","volume-title":"https:\/\/www.tensorflow.org\/tfx\/guide\/serving","author":"Tensorflow","year":"2024","unstructured":"Tensorflow serving for model deployment in production. https:\/\/www.tensorflow.org\/tfx\/guide\/serving, 2024."},{"key":"e_1_3_2_1_19_1","volume-title":"https:\/\/developer.nvidia.com\/nvidiatriton-inference-server","author":"Triton","year":"2024","unstructured":"Triton inference server. https:\/\/developer.nvidia.com\/nvidiatriton-inference-server, 2024."},{"key":"e_1_3_2_1_20_1","volume-title":"Approximate caching for efficiently serving diffusion models. arXiv preprint arXiv:2312.04429","author":"Agarwal Shubham","year":"2023","unstructured":"Shubham Agarwal, Subrata Mitra, Sarthak Chakraborty, Srikrishna Karanam, Koyel Mukherjee, and Shiv Saini. Approximate caching for efficiently serving diffusion models. arXiv preprint arXiv:2312.04429, 2023."},{"issue":"3","key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3725273","article-title":"Managing chunk-caches for efficient retrieval-augmented generation","volume":"3","author":"Agarwal Shubham","year":"2025","unstructured":"Shubham Agarwal, Sai Sundaresan, Subrata Mitra, Debabrata Mahapatra, Archit Gupta, Rounak Sharma, Nirmal Joshua Kapu, Tong Yu, and Shiv Saini. Cache-craft: Managing chunk-caches for efficient retrieval-augmented generation. Proceedings of the ACM on Management of Data, 3(3):1\u201328, 2025.","journal-title":"Proceedings of the ACM on Management of Data"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3759441.3759444"},{"key":"e_1_3_2_1_23_1","volume-title":"Proteus: A high-throughput inference-serving system with accuracy scaling","author":"Ahmad Sohaib","year":"2024","unstructured":"Sohaib Ahmad, Hui Guan, Brian D. Friedman, Thomas Williams, Ramesh K. Sitaraman, and Thomas Woo. Proteus: A high-throughput inference-serving system with accuracy scaling. 2024."},{"key":"e_1_3_2_1_24_1","first-page":"15","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Ali Ahsan","unstructured":"Ahsan Ali, Riccardo Pinciroli, Feng Yan, and Evgenia Smirni. Batch: machine learning inference serving on serverless platforms with adaptive batching. In SC20: International Conference for High Performance Computing, Networking, Storage and Analysis, pages 1\u201315. IEEE, 2020."},{"key":"e_1_3_2_1_25_1","volume-title":"NeurIPS","author":"Anonymous","year":"2024","unstructured":"Anonymous. Real: Realism evaluation of text-to-image generation models for effective data augmentation. In NeurIPS, 2024. Under review."},{"key":"e_1_3_2_1_26_1","volume-title":"A survey on quality metrics for text-to-image generation. arXiv preprint arXiv:2403.11821","author":"Anonymous","year":"2024","unstructured":"Anonymous. A survey on quality metrics for text-to-image generation. arXiv preprint arXiv:2403.11821, 2024."},{"key":"e_1_3_2_1_27_1","first-page":"33","volume-title":"2019 IEEE International Conference on Cloud Engineering (IC2E)","author":"Bhattacharjee Anirban","unstructured":"Anirban Bhattacharjee, Ajay Dev Chhokra, Zhuangwei Kang, Hongyang Sun, Aniruddha Gokhale, and Gabor Karsai. Barista: Efficient and scalable serverless serving system for deep learning prediction services. In 2019 IEEE International Conference on Cloud Engineering (IC2E), pages 23\u201333. IEEE, 2019."},{"key":"e_1_3_2_1_28_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877\u20131901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877\u20131901, 2020."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of EMNLP","author":"Cao Jingtao","year":"2024","unstructured":"Jingtao Cao, Zheng Zhang, Hongru Wang, and Kam-Fai Wong. Vleu: A method for automatic evaluation of text-to-image model generalizability. In Proceedings of EMNLP, 2024."},{"key":"e_1_3_2_1_30_1","volume-title":"IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR)","author":"Chen Muxi","year":"2024","unstructured":"Muxi Chen, Yi Liu, Jian Yi, Changran Xu, Qiuxia Lai, Hongliang Wang, Tsung-Yi Ho, and Qiang Xu. Evaluating text-to-image generative models: An empirical study on human image synthesis. In IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), 2024."},{"key":"e_1_3_2_1_31_1","volume-title":"Janus-pro: Unified multimodal understanding and generation with data and model scaling. arXiv preprint arXiv:2501.17811","author":"Chen Xiaokang","year":"2025","unstructured":"Xiaokang Chen, Zhiyu Wu, Xingchao Liu, Zizheng Pan, Wen Liu, Zhenda Xie, Xingkai Yu, and Chong Ruan. Janus-pro: Unified multimodal understanding and generation with data and model scaling. arXiv preprint arXiv:2501.17811, 2025."},{"key":"e_1_3_2_1_32_1","first-page":"627","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael J Franklin, Joseph E Gonzalez, and Ion Stoica. Clipper: A {Low-Latency} online prediction serving system. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17), pages 613\u2013627, 2017."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2023.3261988"},{"key":"e_1_3_2_1_34_1","volume-title":"Diffusion models beat gans on image synthesis. Advances in neural information processing systems, 34:8780\u20138794","author":"Dhariwal Prafulla","year":"2021","unstructured":"Prafulla Dhariwal and Alexander Nichol. Diffusion models beat gans on image synthesis. Advances in neural information processing systems, 34:8780\u20138794, 2021."},{"key":"e_1_3_2_1_35_1","volume-title":"Structural pruning for diffusion models. arXiv preprint arXiv:2305.10924","author":"Fang Gongfan","year":"2023","unstructured":"Gongfan Fang, Xinyin Ma, and Xinchao Wang. Structural pruning for diffusion models. arXiv preprint arXiv:2305.10924, 2023."},{"key":"e_1_3_2_1_36_1","volume-title":"Who knows the answer? finding the best model and prompt for each query using confidence-based search. AAAI'24\/IAAI'24\/EAAI'24","author":"Gerych Walter","year":"2024","unstructured":"Walter Gerych, Yara Rizk, Vatche Isahagian, Vinod Muthusamy, Evelyn Duesterwald, and Praveen Venkateswaran. Who knows the answer? finding the best model and prompt for each query using confidence-based search. AAAI'24\/IAAI'24\/EAAI'24. AAAI Press, 2024."},{"key":"e_1_3_2_1_37_1","first-page":"1057","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Gunasekaran Jashwant Raj","year":"2022","unstructured":"Jashwant Raj Gunasekaran, Cyan Subhra Mishra, Prashanth Thinakaran, Bikash Sharma, Mahmut Taylan Kandemir, and Chita R Das. Cocktail: A multidimensional optimization for model serving in cloud. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22), pages 1041\u20131057, 2022."},{"key":"e_1_3_2_1_38_1","first-page":"1890","volume-title":"Proceedings of the 2022 International Conference on Management of Data","author":"Guo Peizhen","year":"2022","unstructured":"Peizhen Guo, Bo Hu, and Wenjun Hu. Sommelier: Curating dnn models for the masses. In Proceedings of the 2022 International Conference on Management of Data, pages 1876\u20131890, 2022."},{"key":"e_1_3_2_1_39_1","volume-title":"A survey on quality metrics for text-to-image generation. arXiv preprint arXiv:2403.11821","author":"Hartwig Sebastian","year":"2024","unstructured":"Sebastian Hartwig, Dominik Engel, Leon Sick, Hannah Kniesel, Tristan Payer, Poonam Poonam, Michael Gl\u00f6ckler, Alex B\u00e4uerle, and Timo Ropinski. A survey on quality metrics for text-to-image generation. arXiv preprint arXiv:2403.11821, 2024."},{"key":"e_1_3_2_1_40_1","volume-title":"Fastdecode: High-throughput gpu-efficient llm serving using heterogeneous pipelines. arXiv preprint arXiv:2403.11421","author":"He Jiaao","year":"2024","unstructured":"Jiaao He and Jidong Zhai. Fastdecode: High-throughput gpu-efficient llm serving using heterogeneous pipelines. arXiv preprint arXiv:2403.11421, 2024."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_42_1","first-page":"36","article-title":"Accurate post-training quantization for diffusion models","author":"He Yefei","year":"2024","unstructured":"Yefei He, Luping Liu, Jing Liu, Weijia Wu, Hong Zhou, and Bohan Zhuang. Ptqd: Accurate post-training quantization for diffusion models. Advances in Neural Information Processing Systems, 36, 2024.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_43_1","volume-title":"Ronan Le Bras, and Yejin Choi. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718","author":"Hessel Jack","year":"2021","unstructured":"Jack Hessel, Ari Holtzman, Maxwell Forbes, Ronan Le Bras, and Yejin Choi. Clipscore: A reference-free evaluation metric for image captioning. arXiv preprint arXiv:2104.08718, 2021."},{"key":"e_1_3_2_1_44_1","volume-title":"Denoising diffusion probabilistic models. Advances in neural information processing systems, 33:6840\u20136851","author":"Ho Jonathan","year":"2020","unstructured":"Jonathan Ho, Ajay Jain, and Pieter Abbeel. Denoising diffusion probabilistic models. Advances in neural information processing systems, 33:6840\u20136851, 2020."},{"key":"e_1_3_2_1_45_1","volume-title":"Nxmtransformer: Semi-structured sparsification for natural language understanding via admm. Advances in neural information processing systems, 34:1818\u20131830","author":"Holmes Connor","year":"2021","unstructured":"Connor Holmes, Minjia Zhang, Yuxiong He, and Bo Wu. Nxmtransformer: Semi-structured sparsification for natural language understanding via admm. Advances in neural information processing systems, 34:1818\u20131830, 2021."},{"key":"e_1_3_2_1_46_1","volume-title":"International Conference on Learning Representations","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, Weizhu Chen, et al. Lora: Low-rank adaptation of large language models. In International Conference on Learning Representations, 2021."},{"key":"e_1_3_2_1_47_1","volume-title":"Benchmark vector search databases with one million data. https:\/\/jina.ai\/news\/benchmark-vector-search-databases-with-one-million-data\/","year":"2022","unstructured":"Jina.ai. Benchmark vector search databases with one million data. https:\/\/jina.ai\/news\/benchmark-vector-search-databases-with-one-million-data\/, 2022."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_49_1","first-page":"2","volume-title":"Proceedings of naacL-HLT","volume":"1","author":"Ming-Wei Chang Jacob Devlin","year":"2019","unstructured":"Jacob Devlin Ming-Wei Chang Kenton and Lee Kristina Toutanova. Bert: Pretraining of deep bidirectional transformers for language understanding. In Proceedings of naacL-HLT, volume 1, page 2, 2019."},{"key":"e_1_3_2_1_50_1","volume-title":"Pick-a-pic: An open dataset of user preferences for text-to-image generation. arXiv preprint arXiv:2305.01569","author":"Kirstain Yuval","year":"2023","unstructured":"Yuval Kirstain, Adam Polyak, Uriel Singer, Shahbuland Matiana, Joe Penna, and Omer Levy. Pick-a-pic: An open dataset of user preferences for text-to-image generation. arXiv preprint arXiv:2305.01569, 2023."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3600006.3613165"},{"key":"e_1_3_2_1_52_1","volume-title":"NSDI","author":"Lai Fan","year":"2023","unstructured":"Fan Lai, Yinwei Dai, Harsha V Madhyastha, and Mosharaf Chowdhury. {ModelKeeper}: Accelerating {DNN} training via automated training warmup. In NSDI, 2023."},{"key":"e_1_3_2_1_53_1","volume-title":"OSDI","author":"Lee Yunseong","year":"2018","unstructured":"Yunseong Lee, Alberto Scolari, Byung-Gon Chun, Marco Domenico Santambrogio, Markus Weimer, and Matteo Interlandi. {PRETZEL}: Opening the black box of machine learning prediction serving systems. In OSDI, 2018."},{"key":"e_1_3_2_1_54_1","volume-title":"Autoregressive image generation without vector quantization. arXiv preprint arXiv:2406.11838","author":"Li Tianhong","year":"2024","unstructured":"Tianhong Li, Yonglong Tian, He Li, Mingyang Deng, and Kaiming He. Autoregressive image generation without vector quantization. arXiv preprint arXiv:2406.11838, 2024. NeurIPS 2024 Spotlight."},{"key":"e_1_3_2_1_55_1","volume-title":"Q-diffusion: Quantizing diffusion models. arXiv","author":"Li Xiuyu","year":"2023","unstructured":"Xiuyu Li, Yijiang Liu, Long Lian, Huanrui Yang, Zhen Dong, Daniel Kang, Shanghang Zhang, and Kurt Keutzer. Q-diffusion: Quantizing diffusion models. arXiv, 2023."},{"key":"e_1_3_2_1_56_1","volume-title":"Qserve: W4a8kv4 quantization and system co-design for efficient llm serving. arXiv preprint arXiv:2405.04532","author":"Lin Yujun","year":"2024","unstructured":"Yujun Lin, Haotian Tang, Shang Yang, Zhekai Zhang, Guangxuan Xiao, Chuang Gan, and Song Han. Qserve: W4a8kv4 quantization and system co-design for efficient llm serving. arXiv preprint arXiv:2405.04532, 2024."},{"key":"e_1_3_2_1_57_1","volume-title":"Evaluating text-to-visual generation with image-to-text generation. arXiv preprint arXiv:2404.01291","author":"Lin Zhiqiu","year":"2024","unstructured":"Zhiqiu Lin, Deepak Pathak, Baiqi Li, Jiayao Li, Xide Xia, Graham Neubig, Pengchuan Zhang, and Deva Ramanan. Evaluating text-to-visual generation with image-to-text generation. arXiv preprint arXiv:2404.01291, 2024."},{"key":"e_1_3_2_1_58_1","first-page":"1040","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Liu Yizhi","year":"2019","unstructured":"Yizhi Liu, Yao Wang, Ruofei Yu, Mu Li, Vin Sharma, and Yida Wang. Optimizing {CNN} model inference on {CPUs}. In 2019 USENIX Annual Technical Conference (USENIX ATC 19), pages 1025\u20131040, 2019."},{"key":"e_1_3_2_1_59_1","first-page":"306","volume-title":"European Conference on Computer Vision","author":"Lu Chen-Yi","unstructured":"Chen-Yi Lu, Shubham Agarwal, Md Mehrab Tanjim, Kanak Mahadik, Anup Rao, Subrata Mitra, Shiv Kumar Saini, Saurabh Bagchi, and Somali Chaterji. Recon: Training-free acceleration for text-to-image synthesis with retrieval of concept prompt trajectories. In European Conference on Computer Vision, pages 288\u2013306. Springer, 2024."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52729.2023.01374"},{"key":"e_1_3_2_1_61_1","first-page":"9008","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"35","author":"Mondal Shanka Subhra","year":"2021","unstructured":"Shanka Subhra Mondal, Nikhil Sheoran, and Subrata Mitra. Scheduling of time-varying workloads using reinforcement learning. In Proceedings of the AAAI Conference on Artificial Intelligence, volume 35, pages 9000\u20139008, 2021."},{"key":"e_1_3_2_1_62_1","volume-title":"NeurIPS 2024 Workshop on Compositional Learning: Perspectives, Methods, and Paths Forward.","author":"Oriyad Arash Mari","unstructured":"Arash Mari Oriyad, Parham Rezaei, Mahdieh Soleymani Baghshah, and Mohammad Hossein Rohban. Diffusion beats autoregressive: An evaluation of compositional generation in text-to-image models. In NeurIPS 2024 Workshop on Compositional Learning: Perspectives, Methods, and Paths Forward."},{"key":"e_1_3_2_1_63_1","volume-title":"Sdxl: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952","author":"Podell Dustin","year":"2023","unstructured":"Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas M\u00fcller, Joe Penna, and Robin Rombach. Sdxl: improving latent diffusion models for high-resolution image synthesis. arXiv preprint arXiv:2307.01952, 2023."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"e_1_3_2_1_65_1","first-page":"411","volume-title":"2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Romero Francisco","year":"2021","unstructured":"Francisco Romero, Qian Li, Neeraja J Yadwadkar, and Christos Kozyrakis. { INFaaS}: Automated model-less inference serving. In 2021 USENIX Annual Technical Conference (USENIX ATC 21), pages 397\u2013411, 2021."},{"key":"e_1_3_2_1_66_1","volume-title":"International Conference on Learning Representations","author":"Salimans Tim","year":"2021","unstructured":"Tim Salimans and Jonathan Ho. Progressive distillation for fast sampling of diffusion models. In International Conference on Learning Representations, 2021."},{"key":"e_1_3_2_1_67_1","first-page":"218","volume-title":"Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC 20)","author":"Shahrad Mohammad","year":"2020","unstructured":"Mohammad Shahrad, Rodrigo Fonseca, Inigo Goiri, Gohar Chaudhry, Paul Batum, Jason Cooke, Eduardo Laureano, Colby Tresness, Mark Russinovich, and Ricardo Bianchini. Serverless in the wild: Characterizing and optimizing the serverless workload at a large cloud provider. In 2020 USENIX annual technical conference (USENIX ATC 20), pages 205\u2013218, 2020."},{"key":"e_1_3_2_1_68_1","first-page":"1981","volume-title":"Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition","author":"Shang Yuzhang","year":"2023","unstructured":"Yuzhang Shang, Zhihang Yuan, Bin Xie, Bingzhe Wu, and Yan Yan. Post-training quantization on diffusion models. In Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pages 1972\u20131981, 2023."},{"key":"e_1_3_2_1_69_1","first-page":"337","volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles","author":"Shen Haichen","year":"2019","unstructured":"Haichen Shen, Lequn Chen, Yuchen Jin, Liangyu Zhao, Bingyu Kong, Matthai Philipose, Arvind Krishnamurthy, and Ravi Sundaram. Nexus: A gpu cluster engine for accelerating dnn-based video analysis. In Proceedings of the 27th ACM Symposium on Operating Systems Principles, pages 322\u2013337, 2019."},{"key":"e_1_3_2_1_70_1","volume-title":"Autoregressive model beats diffusion: Llama for scalable image generation. arXiv preprint arXiv:2406.06525","author":"Sun Peize","year":"2024","unstructured":"Peize Sun, Yi Jiang, Shoufa Chen, Shilong Zhang, Bingyue Peng, Ping Luo, and Zehuan Yuan. Autoregressive model beats diffusion: Llama for scalable image generation. arXiv preprint arXiv:2406.06525, 2024."},{"key":"e_1_3_2_1_71_1","first-page":"6114","volume-title":"International conference on machine learning","author":"Tan Mingxing","unstructured":"Mingxing Tan and Quoc Le. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning, pages 6105\u20136114. PMLR, 2019."},{"key":"e_1_3_2_1_72_1","volume-title":"Automatic evaluation for text-to-image generation: Task-decomposed framework, distilled training, and meta-evaluation benchmark. arXiv preprint arXiv:2411.15488","author":"Tu Rong-Cheng","year":"2024","unstructured":"Rong-Cheng Tu, Zi-Ao Ma, Tian Lan, Yuehao Zhao, Heyan Huang, and Xian-Ling Mao. Automatic evaluation for text-to-image generation: Task-decomposed framework, distilled training, and meta-evaluation benchmark. arXiv preprint arXiv:2411.15488, 2024."},{"key":"e_1_3_2_1_73_1","first-page":"10","volume-title":"Proceedings of the 48th International Conference on Parallel Processing","author":"Wang Leyuan","year":"2019","unstructured":"Leyuan Wang, Zhi Chen, Yizhi Liu, Yao Wang, Lianmin Zheng, Mu Li, and Yida Wang. A unified optimization approach for cnn model inference on integrated gpus. In Proceedings of the 48th International Conference on Parallel Processing, pages 1\u201310, 2019."},{"key":"e_1_3_2_1_74_1","first-page":"248","volume-title":"Proceedings of the Eighteenth European Conference on Computer Systems","author":"Wang Yiding","year":"2023","unstructured":"Yiding Wang, Kai Chen, Haisheng Tan, and Kun Guo. Tabi: An efficient multilevel inference system for large language models. In Proceedings of the Eighteenth European Conference on Computer Systems, pages 233\u2013248, 2023."},{"key":"e_1_3_2_1_75_1","volume-title":"Amelie Chi Zhou, and Xiaowen Chu. Towards efficient and reliable llm serving: A real-world workload study. arXiv preprint arXiv:2401.17644","author":"Wang Yuxin","year":"2024","unstructured":"Yuxin Wang, Yuhan Chen, Zeyu Li, Zhenheng Tang, Rui Guo, Xin Wang, Qiang Wang, Amelie Chi Zhou, and Xiaowen Chu. Towards efficient and reliable llm serving: A real-world workload study. arXiv preprint arXiv:2401.17644, 2024."},{"key":"e_1_3_2_1_76_1","volume-title":"Diffusiondb: A large-scale prompt gallery dataset for text-to-image generative models. arXiv preprint arXiv:2210.14896","author":"Wang Zijie J","year":"2022","unstructured":"Zijie J Wang, Evan Montoya, David Munechika, Haoyang Yang, Benjamin Hoover, and Duen Horng Chau. Diffusiondb: A large-scale prompt gallery dataset for text-to-image generative models. arXiv preprint arXiv:2210.14896, 2022."},{"key":"e_1_3_2_1_77_1","volume-title":"What are diffusion models?","author":"Weng Lilian","year":"2021","unstructured":"Lilian Weng. What are diffusion models?. 2021."},{"key":"e_1_3_2_1_78_1","first-page":"191","volume-title":"Proceedings of the Workshop on Hot Topics in Operating Systems","author":"Yadwadkar Neeraja J","year":"2019","unstructured":"Neeraja J Yadwadkar, Francisco Romero, Qian Li, and Christos Kozyrakis. A case for managed and model-less inference serving. In Proceedings of the Workshop on Hot Topics in Operating Systems, pages 184\u2013191, 2019."},{"key":"e_1_3_2_1_79_1","first-page":"11886","volume-title":"International Conference on Machine Learning","author":"Yao Zhewei","unstructured":"Zhewei Yao, Zhen Dong, Zhangcheng Zheng, Amir Gholami, Jiali Yu, Eric Tan, Leyuan Wang, Qijing Huang, Yida Wang, Michael Mahoney, et al. Hawq-v3: Dyadic neural network quantization. In International Conference on Machine Learning, pages 11875\u201311886. PMLR, 2021."},{"key":"e_1_3_2_1_80_1","first-page":"27168","article-title":"Efficient and affordable post-training quantization for large-scale transformers","volume":"35","author":"Yao Zhewei","year":"2022","unstructured":"Zhewei Yao, Reza Yazdani Aminabadi, Minjia Zhang, Xiaoxia Wu, Conglong Li, and Yuxiong He. Zeroquant: Efficient and affordable post-training quantization for large-scale transformers. Advances in Neural Information Processing Systems, 35:27168\u201327183, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01053"},{"key":"e_1_3_2_1_82_1","first-page":"538","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. Orca: A distributed serving system for {Transformer-Based} generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 521\u2013538, 2022."},{"key":"e_1_3_2_1_83_1","first-page":"1062","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019","unstructured":"Chengliang Zhang, Minchen Yu, Wei Wang, and Feng Yan. {MArk}: Exploiting cloud services for {Cost-Effective},{SLO-Aware} machine learning inference serving. In 2019 USENIX Annual Technical Conference (USENIX ATC 19), pages 1049\u20131062, 2019."},{"key":"e_1_3_2_1_84_1","volume-title":"Laptop-diff: Layer pruning and normalized distillation for compressing diffusion models. arXiv preprint arXiv:2404.11098","author":"Zhang Dingkun","year":"2024","unstructured":"Dingkun Zhang, Sijia Li, Chen Chen, Qingsong Xie, and Haonan Lu. Laptop-diff: Layer pruning and normalized distillation for compressing diffusion models. arXiv preprint arXiv:2404.11098, 2024."},{"key":"e_1_3_2_1_85_1","volume-title":"12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20)","author":"Zhang Jeff","year":"2020","unstructured":"Jeff Zhang, Sameh Elnikety, Shuayb Zarar, Atul Gupta, and Siddharth Garg. {Model-Switching}: Dealing with fluctuating workloads in {Machine-Learning-as-a-Service} systems. In 12th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 20), 2020."},{"key":"e_1_3_2_1_86_1","first-page":"41785","volume-title":"International Conference on Machine Learning","author":"Zhang Kexun","unstructured":"Kexun Zhang, Xianjun Yang, William Yang Wang, and Lei Li. Redi: efficient learning-free diffusion inference via trajectory retrieval. In International Conference on Machine Learning, pages 41770\u201341785. PMLR, 2023."},{"key":"e_1_3_2_1_87_1","first-page":"14011","article-title":"Accelerating training of transformer-based language models with progressive layer dropping","volume":"33","author":"Zhang Minjia","year":"2020","unstructured":"Minjia Zhang and Yuxiong He. Accelerating training of transformer-based language models with progressive layer dropping. Advances in Neural Information Processing Systems, 33:14011\u201314023, 2020.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_88_1","volume-title":"Llm-pq: Serving llm on heterogeneous clusters with phase-aware partition and adaptive quantization. arXiv preprint arXiv:2403.01136","author":"Zhao Juntao","year":"2024","unstructured":"Juntao Zhao, Borui Wan, Yanghua Peng, Haibin Lin, and Chuan Wu. Llm-pq: Serving llm on heterogeneous clusters with phase-aware partition and adaptive quantization. arXiv preprint arXiv:2403.01136, 2024."}],"event":{"name":"MIDDLEWARE '25: 26th International Middleware Conference","location":"Vanderbilt University Nashville TN USA","acronym":"MIDDLEWARE '25","sponsor":["IFIP","Usenix"]},"container-title":["Proceedings of the 26th International Middleware Conference"],"original-title":[],"deposited":{"date-parts":[[2025,12,8]],"date-time":"2025-12-08T19:57:23Z","timestamp":1765223843000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3721462.3770765"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,12,14]]},"references-count":88,"alternative-id":["10.1145\/3721462.3770765","10.1145\/3721462"],"URL":"https:\/\/doi.org\/10.1145\/3721462.3770765","relation":{},"subject":[],"published":{"date-parts":[[2025,12,14]]},"assertion":[{"value":"2025-12-14","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}