{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T17:32:39Z","timestamp":1777138359942,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,11,13]],"date-time":"2021-11-13T00:00:00Z","timestamp":1636761600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62022057, 61832006, 61632017, 61872240"],"award-info":[{"award-number":["62022057, 61832006, 61632017, 61872240"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,11,14]]},"DOI":"10.1145\/3458817.3476143","type":"proceedings-article","created":{"date-parts":[[2021,10,21]],"date-time":"2021-10-21T05:10:34Z","timestamp":1634793034000},"page":"1-15","source":"Crossref","is-referenced-by-count":57,"title":["Enable simultaneous DNN services based on deterministic operator overlap and precise latency prediction"],"prefix":"10.1145","author":[{"given":"Weihao","family":"Cui","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Han","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Quan","family":"Chen","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ningxin","family":"Zheng","sequence":"additional","affiliation":[{"name":"Microsoft Research Asia"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jingwen","family":"Leng","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jieru","family":"Zhao","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhuo","family":"Song","sequence":"additional","affiliation":[{"name":"Alibaba Cloud"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tao","family":"Ma","sequence":"additional","affiliation":[{"name":"Alibaba Cloud"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yong","family":"Yang","sequence":"additional","affiliation":[{"name":"Alibaba Cloud"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chao","family":"Li","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2021,11,13]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2021. cgroups. https:\/\/www.kernel.org\/doc\/Documentation\/cgroup-v2.txt.  2021. cgroups. https:\/\/www.kernel.org\/doc\/Documentation\/cgroup-v2.txt."},{"key":"e_1_3_2_2_2_1","unstructured":"2021. memcached. https:\/\/memcached.org.  2021. memcached. https:\/\/memcached.org."},{"key":"e_1_3_2_2_3_1","volume-title":"A multi-neural network acceleration architecture","author":"Baek Eunjin"},{"key":"e_1_3_2_2_4_1","unstructured":"Zhihao Bai Zhen Zhang Yibo Zhu and Xin Jin. 2020. PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications. In OSDI. 499--514.  Zhihao Bai Zhen Zhang Yibo Zhu and Xin Jin. 2020. PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications. In OSDI. 499--514."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1961189.1961199"},{"key":"e_1_3_2_2_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330370"},{"key":"e_1_3_2_2_7_1","volume-title":"Jason Mars, and Lingjia Tang.","author":"Chen Quan","year":"2017"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2872362.2872368"},{"key":"e_1_3_2_2_9_1","volume-title":"Parties: Qos-aware resource partitioning for multiple interactive services. In ASPLOS. 107--120.","author":"Chen Shuang","year":"2019"},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_2_11_1","volume-title":"Prema: A predictive multi-task scheduling algorithm for preemptible neural processing units","author":"Choi Yujeong","year":"2020"},{"key":"e_1_3_2_2_12_1","volume-title":"Inferline: Ml inference pipeline composition framework. arXiv preprint arXiv:1812.01776","author":"Crankshaw Daniel","year":"2018"},{"key":"e_1_3_2_2_13_1","volume-title":"Clipper: A low-latency online prediction serving system. In NSDI. 613--627.","author":"Crankshaw Daniel","year":"2017"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3047638"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD46524.2019.00075"},{"key":"e_1_3_2_2_16_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018"},{"key":"e_1_3_2_2_17_1","unstructured":"Arpan Gujarati Reza Karimi Safya Alzayat Wei Hao Antoine Kaufmann Ymir Vigfusson and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In OSDI. 443--462.  Arpan Gujarati Reza Karimi Safya Alzayat Wei Hao Antoine Kaufmann Ymir Vigfusson and Jonathan Mace. 2020. Serving DNNs like Clockwork: Performance Predictability from the Bottom Up. In OSDI. 443--462."},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433722"},{"key":"e_1_3_2_2_19_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778.  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2016. Deep residual learning for image recognition. In CVPR. 770--778."},{"key":"e_1_3_2_2_20_1","volume-title":"Long short-term memory. Neural computation 9, 8","author":"Hochreiter Sepp","year":"1997"},{"key":"e_1_3_2_2_21_1","volume-title":"Tailbench: a benchmark suite and evaluation methodology for latency-critical applications","author":"Kasture Harshad"},{"key":"e_1_3_2_2_22_1","unstructured":"Kubernetes. 2021. Kubernetes. https:\/\/kubernetes.io.  Kubernetes. 2021. Kubernetes. https:\/\/kubernetes.io."},{"key":"e_1_3_2_2_23_1","volume-title":"Rammer: Enabling Holistic Deep Learning Compiler Optimizations with rTasks. In OSDI. 881--897.","author":"Ma Lingxiao","year":"2020"},{"key":"e_1_3_2_2_24_1","unstructured":"NVIDIA. 2021. CUDA C\/C++ Streams and Concurrency. https:\/\/developer.download.nvidia.com\/CUDA\/training\/StreamsAndConcurrencyWebinar.pdf.  NVIDIA. 2021. CUDA C\/C++ Streams and Concurrency. https:\/\/developer.download.nvidia.com\/CUDA\/training\/StreamsAndConcurrencyWebinar.pdf."},{"key":"e_1_3_2_2_25_1","unstructured":"Nvidia. 2021. Multi-Instance GPU. https:\/\/docs.nvidia.com\/cuda\/mig\/index.html.  Nvidia. 2021. Multi-Instance GPU. https:\/\/docs.nvidia.com\/cuda\/mig\/index.html."},{"key":"e_1_3_2_2_26_1","unstructured":"NVIDIA. 2021. Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf.  NVIDIA. 2021. Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf."},{"key":"e_1_3_2_2_27_1","unstructured":"NVIDIA. 2021. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf.  NVIDIA. 2021. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_2_2_28_1","unstructured":"NVIDIA. 2021. NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-Compute.  NVIDIA. 2021. NVIDIA Nsight Compute. https:\/\/developer.nvidia.com\/nsight-Compute."},{"key":"e_1_3_2_2_29_1","unstructured":"NVIDIA. 2021. NVIDIA Triton Inference Server. https:\/\/github.com\/NVIDIA\/triton-inference-server.  NVIDIA. 2021. NVIDIA Triton Inference Server. https:\/\/github.com\/NVIDIA\/triton-inference-server."},{"key":"e_1_3_2_2_30_1","unstructured":"NVIDIA. 2021. Profiler User's Guide. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html.  NVIDIA. 2021. Profiler User's Guide. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html."},{"key":"e_1_3_2_2_31_1","unstructured":"NVIDIA. 2021. TensorRT. https:\/\/developer.nvidia.com\/tensorrt.  NVIDIA. 2021. TensorRT. https:\/\/developer.nvidia.com\/tensorrt."},{"key":"e_1_3_2_2_32_1","volume-title":"Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint arXiv:1712.06139","author":"Olston Christopher","year":"2017"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3023997"},{"key":"e_1_3_2_2_34_1","volume-title":"Sturgeon: Preference-aware Co-location for Improving Utilization of Power Constrained Computers. In 2020 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 718--727","author":"Pang Pu","year":"2020"},{"key":"e_1_3_2_2_35_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703","author":"Paszke Adam","year":"2019"},{"key":"e_1_3_2_2_36_1","volume-title":"Clite: Efficient and qos-aware co-location of multiple latency-critical jobs for warehouse scale computers","author":"Patel Tirthak","year":"2020"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356164"},{"key":"e_1_3_2_2_38_1","volume-title":"Dilip Sequeira, Ashish Sirasao, Fei Sun, Hanlin Tang, Michael Thomson, Frank Wei, Ephrem Wu, Lingjie Xu, Koichi Yamada, Bing Yu, George Yuan, Aaron Zhong, Peizhao Zhang, and Yuchen Zhou.","author":"Reddi Vijay Janapa","year":"2019"},{"key":"e_1_3_2_2_39_1","volume-title":"Yolov3: An incremental improvement. arXiv preprint arXiv:1804.02767","author":"Redmon Joseph","year":"2018"},{"key":"e_1_3_2_2_40_1","volume-title":"INFaaS: A Model-less Inference Serving System. arXiv preprint arXiv:1905.13348","author":"Romero Francisco","year":"2019"},{"key":"e_1_3_2_2_41_1","volume-title":"Scale-sim: Systolic cnn accelerator simulator. arXiv preprint arXiv:1811.02883","author":"Samajdar Ananda","year":"2018"},{"key":"e_1_3_2_2_42_1","volume-title":"Linear regression analysis","author":"Seber George AF"},{"key":"e_1_3_2_2_43_1","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Shahrad Mohammad","year":"2020"},{"key":"e_1_3_2_2_44_1","doi-asserted-by":"crossref","unstructured":"Haichen Shen Lequn Chen Yuchen Jin Liangyu Zhao Bingyu Kong Matthai Philipose Arvind Krishnamurthy and Ravi Sundaram. 2019. Nexus: a GPU cluster engine for accelerating DNN-based video analysis. In SOSP. 322--337.  Haichen Shen Lequn Chen Yuchen Jin Liangyu Zhao Bingyu Kong Matthai Philipose Arvind Krishnamurthy and Ravi Sundaram. 2019. Nexus: a GPU cluster engine for accelerating DNN-based video analysis. In SOSP. 322--337.","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"crossref","unstructured":"Abhinav Shrivastava Abhinav Gupta and Ross Girshick. 2016. Training region-based object detectors with online hard example mining. In CVPR. 761--769.  Abhinav Shrivastava Abhinav Gupta and Ross Girshick. 2016. Training region-based object detectors with online hard example mining. In CVPR. 761--769.","DOI":"10.1109\/CVPR.2016.89"},{"key":"e_1_3_2_2_46_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"crossref","unstructured":"Christian Szegedy Vincent Vanhoucke Sergey Ioffe Jon Shlens and Zbigniew Wojna. 2016. Rethinking the inception architecture for computer vision. In CVPR. 2818--2826.  Christian Szegedy Vincent Vanhoucke Sergey Ioffe Jon Shlens and Zbigniew Wojna. 2016. Rethinking the inception architecture for computer vision. In CVPR. 2818--2826.","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2019.8891040"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3269206.3271739"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2020.03.009"},{"key":"e_1_3_2_2_51_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Zhang Chengliang","year":"2019"},{"key":"e_1_3_2_2_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2021.3064352"},{"key":"e_1_3_2_2_53_1","volume-title":"Bo Wu, Chao Li, and Minyi Guo.","author":"Zhang Wei","year":"2019"},{"key":"e_1_3_2_2_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404397.3404451"},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446693"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3291058"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS47774.2020.00069"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2018.2851207"},{"key":"e_1_3_2_2_59_1","volume-title":"Themis: Predicting and reining in application-level slowdown on spatial multitasking GPUs","author":"Zhao Wenyi","year":"2019"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378457"}],"event":{"name":"SC '21: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis Missouri","acronym":"SC '21","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476143","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3476143","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T17:49:06Z","timestamp":1750268946000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3476143"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,11,13]]},"references-count":60,"alternative-id":["10.1145\/3458817.3476143","10.1145\/3458817"],"URL":"https:\/\/doi.org\/10.1145\/3458817.3476143","relation":{},"subject":[],"published":{"date-parts":[[2021,11,13]]}}}