{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:43:14Z","timestamp":1768030994627,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":106,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,10,8]],"date-time":"2022-10-08T00:00:00Z","timestamp":1665187200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"Facebook Faculty Research Award"},{"name":"Canada Foundation for Innovation"},{"name":"Vector Institute"},{"name":"AWS Machine Learning Research Award"},{"name":"NSERC"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,10,8]]},"DOI":"10.1145\/3559009.3569650","type":"proceedings-article","created":{"date-parts":[[2023,1,27]],"date-time":"2023-01-27T14:02:50Z","timestamp":1674828170000},"page":"317-332","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["GPUPool"],"prefix":"10.1145","author":[{"given":"Xiaodan Serina","family":"Tan","sequence":"first","affiliation":[{"name":"University of Toronto, Toronto, Ontario, Canada"}]},{"given":"Pavel","family":"Golikov","sequence":"additional","affiliation":[{"name":"University of Toronto, Toronto, Ontario, Canada"}]},{"given":"Nandita","family":"Vijaykumar","sequence":"additional","affiliation":[{"name":"University of Toronto, Toronto, Ontario, Canada"}]},{"given":"Gennady","family":"Pekhimenko","sequence":"additional","affiliation":[{"name":"University of Toronto, Toronto, Ontario, Canada"}]}],"member":"320","published-online":{"date-parts":[[2023,1,27]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"[n. d.]. Amazon EC2 P3 Instances. https:\/\/aws.amazon.com\/ec2\/instance-types\/p3\/.  [n. d.]. Amazon EC2 P3 Instances. https:\/\/aws.amazon.com\/ec2\/instance-types\/p3\/."},{"key":"e_1_3_2_2_2_1","unstructured":"[n. d.]. Amazon WorkSpaces. https:\/\/aws.amazon.com\/workspaces\/?nc=sn&loc=1&workspaces-blogs.sort-by=item.additionalFields.createdDate&workspaces-blogs.sort-order=desc.  [n. d.]. Amazon WorkSpaces. https:\/\/aws.amazon.com\/workspaces\/?nc=sn&loc=1&workspaces-blogs.sort-by=item.additionalFields.createdDate&workspaces-blogs.sort-order=desc."},{"key":"e_1_3_2_2_3_1","unstructured":"[n. d.]. AMD Ryzen Threadripper 1950X Processor. https:\/\/www.amd.com\/en\/products\/cpu\/amd-ryzen-threadripper-1950x.  [n. d.]. AMD Ryzen Threadripper 1950X Processor. https:\/\/www.amd.com\/en\/products\/cpu\/amd-ryzen-threadripper-1950x."},{"key":"e_1_3_2_2_4_1","unstructured":"[n. d.]. Arioc. https:\/\/github.com\/RWilton\/Arioc.  [n. d.]. Arioc. https:\/\/github.com\/RWilton\/Arioc."},{"key":"e_1_3_2_2_5_1","unstructured":"[n. d.]. Clara-Parabricks. https:\/\/www.nvidia.com\/en-us\/healthcare\/clara-parabricks\/.  [n. d.]. Clara-Parabricks. https:\/\/www.nvidia.com\/en-us\/healthcare\/clara-parabricks\/."},{"key":"e_1_3_2_2_6_1","unstructured":"[n. d.]. Cloud GPUs. https:\/\/cloud.google.com\/gpu.  [n. d.]. Cloud GPUs. https:\/\/cloud.google.com\/gpu."},{"key":"e_1_3_2_2_7_1","unstructured":"[n. d.]. CUB. https:\/\/nvlabs.github.io\/cub\/.  [n. d.]. CUB. https:\/\/nvlabs.github.io\/cub\/."},{"key":"e_1_3_2_2_8_1","unstructured":"[n. d.]. CUDA C++ Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html.  [n. d.]. CUDA C++ Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html."},{"key":"e_1_3_2_2_9_1","unstructured":"[n. d.]. CUDA Occupancy Calculator. https:\/\/docs.nvidia.com\/cuda\/cuda-occupancy-calculator\/index.html.  [n. d.]. CUDA Occupancy Calculator. https:\/\/docs.nvidia.com\/cuda\/cuda-occupancy-calculator\/index.html."},{"key":"e_1_3_2_2_10_1","unstructured":"[n. d.]. CUDA Pro Tip: Minimize the Tail Effect. https:\/\/developer.nvidia.com\/blog\/cuda-pro-tip-minimize-the-tail-effect\/.  [n. d.]. CUDA Pro Tip: Minimize the Tail Effect. https:\/\/developer.nvidia.com\/blog\/cuda-pro-tip-minimize-the-tail-effect\/."},{"key":"e_1_3_2_2_11_1","unstructured":"[n. d.]. CUDA Samples. https:\/\/github.com\/NVIDIA\/cuda-samples.  [n. d.]. CUDA Samples. https:\/\/github.com\/NVIDIA\/cuda-samples."},{"key":"e_1_3_2_2_12_1","unstructured":"[n. d.]. CudaSift. https:\/\/github.com\/Celebrandil\/CudaSift.  [n. d.]. CudaSift. https:\/\/github.com\/Celebrandil\/CudaSift."},{"key":"e_1_3_2_2_13_1","unstructured":"[n. d.]. Cupoch. https:\/\/github.com\/neka-nat\/cupoch.  [n. d.]. Cupoch. https:\/\/github.com\/neka-nat\/cupoch."},{"key":"e_1_3_2_2_14_1","unstructured":"[n. d.]. Cutlass: CUDA Templates for Linear Algebra Subroutines. https:\/\/github.com\/NVIDIA\/cutlass.  [n. d.]. Cutlass: CUDA Templates for Linear Algebra Subroutines. https:\/\/github.com\/NVIDIA\/cutlass."},{"key":"e_1_3_2_2_15_1","unstructured":"[n. d.]. EdmondsBlossom. https:\/\/github.com\/mattkrick\/EdmondsBlossom.  [n. d.]. EdmondsBlossom. https:\/\/github.com\/mattkrick\/EdmondsBlossom."},{"key":"e_1_3_2_2_16_1","unstructured":"[n. d.]. Feature importances with a forest of trees. https:\/\/scikit-learn.org\/stable\/auto_examples\/ensemble\/plot_forest_importances.html.  [n. d.]. Feature importances with a forest of trees. https:\/\/scikit-learn.org\/stable\/auto_examples\/ensemble\/plot_forest_importances.html."},{"key":"e_1_3_2_2_17_1","unstructured":"[n. d.]. Getting Started with CUDA Graphs. https:\/\/devblogs.nvidia.com\/cuda-graphs\/.  [n. d.]. Getting Started with CUDA Graphs. https:\/\/devblogs.nvidia.com\/cuda-graphs\/."},{"key":"e_1_3_2_2_18_1","unstructured":"[n. d.]. GPUs on Compute Engine. https:\/\/cloud.google.com\/compute\/docs\/gpus.  [n. d.]. GPUs on Compute Engine. https:\/\/cloud.google.com\/compute\/docs\/gpus."},{"key":"e_1_3_2_2_19_1","unstructured":"[n. d.]. Graph Analytics. https:\/\/developer.nvidia.com\/discover\/graph-analytics.  [n. d.]. Graph Analytics. https:\/\/developer.nvidia.com\/discover\/graph-analytics."},{"key":"e_1_3_2_2_20_1","unstructured":"[n. d.]. GROMACS User Guide: Getting Good Performance from mdrun. http:\/\/manual.gromacs.org\/documentation\/current\/user-guide\/mdrun-performance.html.  [n. d.]. GROMACS User Guide: Getting Good Performance from mdrun. http:\/\/manual.gromacs.org\/documentation\/current\/user-guide\/mdrun-performance.html."},{"key":"e_1_3_2_2_21_1","unstructured":"[n. d.]. Gunrock. https:\/\/github.com\/gunrock\/gunrock.  [n. d.]. Gunrock. https:\/\/github.com\/gunrock\/gunrock."},{"key":"e_1_3_2_2_22_1","unstructured":"[n. d.]. HOOMD-blue. https:\/\/github.com\/glotzerlab\/hoomd-blue.  [n. d.]. HOOMD-blue. https:\/\/github.com\/glotzerlab\/hoomd-blue."},{"key":"e_1_3_2_2_23_1","unstructured":"[n. d.]. Linux Virtual Machine Pricing. https:\/\/azure.microsoft.com\/en-us\/pricing\/details\/virtual-machines\/linux\/.  [n. d.]. Linux Virtual Machine Pricing. https:\/\/azure.microsoft.com\/en-us\/pricing\/details\/virtual-machines\/linux\/."},{"key":"e_1_3_2_2_24_1","unstructured":"[n. d.]. MLPerf Benchmark Suite. The MLPerf name and logo are trademarks. See www.mlperf.org for more information. https:\/\/mlperf.org.  [n. d.]. MLPerf Benchmark Suite. The MLPerf name and logo are trademarks. See www.mlperf.org for more information. https:\/\/mlperf.org."},{"key":"e_1_3_2_2_25_1","unstructured":"[n. d.]. MLPerf Inference. https:\/\/mlperf.org\/inference-overview\/.  [n. d.]. MLPerf Inference. https:\/\/mlperf.org\/inference-overview\/."},{"key":"e_1_3_2_2_26_1","unstructured":"[n. d.]. MLPerf Training. https:\/\/mlperf.org\/training-overview\/.  [n. d.]. MLPerf Training. https:\/\/mlperf.org\/training-overview\/."},{"key":"e_1_3_2_2_27_1","unstructured":"[n. d.]. NAMD. https:\/\/www.ks.uiuc.edu\/Research\/namd\/.  [n. d.]. NAMD. https:\/\/www.ks.uiuc.edu\/Research\/namd\/."},{"key":"e_1_3_2_2_28_1","unstructured":"[n. d.]. Nsight Compute. https:\/\/docs.nvidia.com\/nsight-compute\/NsightCompute\/index.html.  [n. d.]. Nsight Compute. https:\/\/docs.nvidia.com\/nsight-compute\/NsightCompute\/index.html."},{"key":"e_1_3_2_2_29_1","unstructured":"[n. d.]. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf.  [n. d.]. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_2_2_30_1","unstructured":"[n. d.]. NVIDIA Clara Parabricks. https:\/\/developer.nvidia.com\/clara-parabricks.  [n. d.]. NVIDIA Clara Parabricks. https:\/\/developer.nvidia.com\/clara-parabricks."},{"key":"e_1_3_2_2_31_1","unstructured":"[n. d.]. Nvidia CUDA Bioinformatics: An Introduction. https:\/\/www.biocentric.nl\/biocentric\/nvidia-cuda-bioinformatics-an-introduction\/.  [n. d.]. Nvidia CUDA Bioinformatics: An Introduction. https:\/\/www.biocentric.nl\/biocentric\/nvidia-cuda-bioinformatics-an-introduction\/."},{"key":"e_1_3_2_2_32_1","unstructured":"[n. d.]. Nvidia cuDNN Documentation. https:\/\/docs.nvidia.com\/deeplearning\/cudnn\/developer-guide\/index.html.  [n. d.]. Nvidia cuDNN Documentation. https:\/\/docs.nvidia.com\/deeplearning\/cudnn\/developer-guide\/index.html."},{"key":"e_1_3_2_2_33_1","unstructured":"[n. d.]. NVIDIA GeForce RTX 2080Ti. https:\/\/www.nvidia.com\/en-us\/geforce\/graphics-cards\/rtx-2080-ti\/.  [n. d.]. NVIDIA GeForce RTX 2080Ti. https:\/\/www.nvidia.com\/en-us\/geforce\/graphics-cards\/rtx-2080-ti\/."},{"key":"e_1_3_2_2_34_1","unstructured":"[n. d.]. NVIDIA Hopper Architecture. https:\/\/www.nvidia.com\/en-us\/technologies\/hopper-architecture\/.  [n. d.]. NVIDIA Hopper Architecture. https:\/\/www.nvidia.com\/en-us\/technologies\/hopper-architecture\/."},{"key":"e_1_3_2_2_35_1","unstructured":"[n. d.]. Nvidia Multi-Instance GPU. https:\/\/www.nvidia.com\/en-us\/technologies\/multi-instance-gpu\/.  [n. d.]. Nvidia Multi-Instance GPU. https:\/\/www.nvidia.com\/en-us\/technologies\/multi-instance-gpu\/."},{"key":"e_1_3_2_2_36_1","unstructured":"[n. d.]. NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html.  [n. d.]. NVIDIA Multi-Instance GPU User Guide. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html."},{"key":"e_1_3_2_2_37_1","unstructured":"[n. d.]. NVIDIA Tesla V100 GPU Architecture. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf.  [n. d.]. NVIDIA Tesla V100 GPU Architecture. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf."},{"key":"e_1_3_2_2_38_1","unstructured":"[n. d.]. NVIDIA Turing GPU Architecture (Graphics Reinvented). https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/design-visualization\/technologies\/turing-architecture\/NVIDIA-Turing-Architecture-Whitepaper.pdf.  [n. d.]. NVIDIA Turing GPU Architecture (Graphics Reinvented). https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/design-visualization\/technologies\/turing-architecture\/NVIDIA-Turing-Architecture-Whitepaper.pdf."},{"key":"e_1_3_2_2_39_1","unstructured":"[n. d.]. Nvidia Virtual GPU Software Documentation. https:\/\/docs.nvidia.com\/grid\/latest\/index.html.  [n. d.]. Nvidia Virtual GPU Software Documentation. https:\/\/docs.nvidia.com\/grid\/latest\/index.html."},{"key":"e_1_3_2_2_40_1","unstructured":"[n. d.]. Profiler User's Guide. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html.  [n. d.]. Profiler User's Guide. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html."},{"key":"e_1_3_2_2_41_1","unstructured":"[n. d.]. pytorch-vision-resnet. https:\/\/github.com\/pytorch\/vision\/blob\/master\/torchvision\/models\/resnet.py.  [n. d.]. pytorch-vision-resnet. https:\/\/github.com\/pytorch\/vision\/blob\/master\/torchvision\/models\/resnet.py."},{"key":"e_1_3_2_2_42_1","unstructured":"[n. d.]. Reduce the costs of ML workflows with preemptible VMs and GPUs. https:\/\/cloud.google.com\/blog\/products\/ai-machine-learning\/reduce-the-costs-of-ml-workflows-with-preemptible-vms-and-gpus.  [n. d.]. Reduce the costs of ML workflows with preemptible VMs and GPUs. https:\/\/cloud.google.com\/blog\/products\/ai-machine-learning\/reduce-the-costs-of-ml-workflows-with-preemptible-vms-and-gpus."},{"key":"e_1_3_2_2_43_1","unstructured":"[n. d.]. sklearn.ensemble.GradientBoostingRegressor. https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.ensemble.GradientBoostingRegressor.html.  [n. d.]. sklearn.ensemble.GradientBoostingRegressor. https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.ensemble.GradientBoostingRegressor.html."},{"key":"e_1_3_2_2_44_1","unstructured":"[n. d.]. sklearn.linear_model.LinearRegression. https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.linear_model.LinearRegression.html.  [n. d.]. sklearn.linear_model.LinearRegression. https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.linear_model.LinearRegression.html."},{"key":"e_1_3_2_2_45_1","unstructured":"[n. d.]. sklearn.neural_network.MLPRegressor. https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.neural_network.MLPRegressor.html.  [n. d.]. sklearn.neural_network.MLPRegressor. https:\/\/scikit-learn.org\/stable\/modules\/generated\/sklearn.neural_network.MLPRegressor.html."},{"key":"e_1_3_2_2_46_1","unstructured":"[n. d.]. SM7_QV100. https:\/\/github.com\/gpgpu-sim\/gpgpu-sim_distribution\/tree\/dev\/configs\/tested-cfgs\/SM7_QV100.  [n. d.]. SM7_QV100. https:\/\/github.com\/gpgpu-sim\/gpgpu-sim_distribution\/tree\/dev\/configs\/tested-cfgs\/SM7_QV100."},{"key":"e_1_3_2_2_47_1","unstructured":"[n. d.]. Virtual Graphics. https:\/\/www.amd.com\/en\/graphics\/workstation-virtual-graphics.  [n. d.]. Virtual Graphics. https:\/\/www.amd.com\/en\/graphics\/workstation-virtual-graphics."},{"key":"e_1_3_2_2_48_1","unstructured":"2015. GPU Pro Tip: CUDA 7 Streams Simplify Concurrency. https:\/\/developer.nvidia.com\/blog\/gpu-pro-tip-cuda-7-streams-simplify-concurrency\/.  2015. GPU Pro Tip: CUDA 7 Streams Simplify Concurrency. https:\/\/developer.nvidia.com\/blog\/gpu-pro-tip-cuda-7-streams-simplify-concurrency\/."},{"key":"e_1_3_2_2_49_1","unstructured":"2020. Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf.  2020. Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf."},{"key":"e_1_3_2_2_50_1","volume-title":"Machine Learning-based Interference Detection in GPGPU Concurrent Kernel Execution. In 2020 25th International Computer Conference, Computer Society of Iran (CSICC). IEEE, 1--4.","author":"Alizadeh Negar Sadat","year":"2020","unstructured":"Negar Sadat Alizadeh and Mahmoud Momtazpour . 2020 . Machine Learning-based Interference Detection in GPGPU Concurrent Kernel Execution. In 2020 25th International Computer Conference, Computer Society of Iran (CSICC). IEEE, 1--4. Negar Sadat Alizadeh and Mahmoud Momtazpour. 2020. Machine Learning-based Interference Detection in GPGPU Concurrent Kernel Execution. In 2020 25th International Computer Conference, Computer Society of Iran (CSICC). IEEE, 1--4."},{"key":"e_1_3_2_2_51_1","volume-title":"Slate: Enabling Workload-Aware Efficient Multiprocessing for Modern GPGPUs. In 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 252--261","author":"Allen Tyler","year":"2019","unstructured":"Tyler Allen , Xizhou Feng , and Rong Ge . 2019 . Slate: Enabling Workload-Aware Efficient Multiprocessing for Modern GPGPUs. In 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 252--261 . Tyler Allen, Xizhou Feng, and Rong Ge. 2019. Slate: Enabling Workload-Aware Efficient Multiprocessing for Modern GPGPUs. In 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS). IEEE, 252--261."},{"key":"e_1_3_2_2_52_1","volume-title":"MASK: Redesigning the GPU Memory Hierarchy to Support Multi-Application Concurrency.","author":"Ausavarungnirun Rachata","year":"2018","unstructured":"Rachata Ausavarungnirun , Vance Miller , Joshua Landgraf , Saugata Ghose , Jayneel Gandhi , Adwait Jog , Christopher J. Rossbach , and Onur Mutlu . 2018 . MASK: Redesigning the GPU Memory Hierarchy to Support Multi-Application Concurrency. (2018). Rachata Ausavarungnirun, Vance Miller, Joshua Landgraf, Saugata Ghose, Jayneel Gandhi, Adwait Jog, Christopher J. Rossbach, and Onur Mutlu. 2018. MASK: Redesigning the GPU Memory Hierarchy to Support Multi-Application Concurrency. (2018)."},{"key":"e_1_3_2_2_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"e_1_3_2_2_54_1","volume-title":"Adaptive Simultaneous Multi-tenancy for GPUs. In Workshop on Job Scheduling Strategies for Parallel Processing. Springer, 83--106","author":"Bashizade Ramin","year":"2018","unstructured":"Ramin Bashizade , Yuxuan Li , and Alvin R Lebeck . 2018 . Adaptive Simultaneous Multi-tenancy for GPUs. In Workshop on Job Scheduling Strategies for Parallel Processing. Springer, 83--106 . Ramin Bashizade, Yuxuan Li, and Alvin R Lebeck. 2018. Adaptive Simultaneous Multi-tenancy for GPUs. In Workshop on Job Scheduling Strategies for Parallel Processing. Springer, 83--106."},{"key":"e_1_3_2_2_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2010.5650274"},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037700"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/2954679.2872368"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1016\/0885-064X(87)90009-4"},{"key":"e_1_3_2_2_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00027"},{"key":"e_1_3_2_2_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/2556583"},{"key":"e_1_3_2_2_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541941"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"publisher","DOI":"10.5555\/645607.661350"},{"key":"e_1_3_2_2_63_1","volume-title":"trees, and flowers. Canadian Journal of mathematics 17","author":"Edmonds Jack","year":"1965","unstructured":"Jack Edmonds . 1965. Paths , trees, and flowers. Canadian Journal of mathematics 17 ( 1965 ), 449--467. Jack Edmonds. 1965. Paths, trees, and flowers. Canadian Journal of mathematics 17 (1965), 449--467."},{"key":"e_1_3_2_2_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2008.44"},{"key":"e_1_3_2_2_65_1","unstructured":"Jerome H Friedman. 1999. Stochastic gradient boosting. mh (x; am) 1000: 0.  Jerome H Friedman. 1999. Stochastic gradient boosting. mh (x; am) 1000: 0."},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_67_1","volume-title":"Dissecting the NVIDIA volta GPU architecture via microbenchmarking. arXiv preprint arXiv:1804.06826","author":"Jia Zhe","year":"2018","unstructured":"Zhe Jia , Marco Maggioni , Benjamin Staiger , and Daniele P Scarpazza . 2018. Dissecting the NVIDIA volta GPU architecture via microbenchmarking. arXiv preprint arXiv:1804.06826 ( 2018 ). Zhe Jia, Marco Maggioni, Benjamin Staiger, and Daniele P Scarpazza. 2018. Dissecting the NVIDIA volta GPU architecture via microbenchmarking. arXiv preprint arXiv:1804.06826 (2018)."},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPA\/IUCC.2017.00087"},{"key":"e_1_3_2_2_69_1","doi-asserted-by":"publisher","DOI":"10.1145\/2818950.2818979"},{"key":"e_1_3_2_2_70_1","volume-title":"Exploring modern GPU memory system design challenges through accurate modeling. arXiv preprint arXiv:1810.07269","author":"Khairy Mahmoud","year":"2018","unstructured":"Mahmoud Khairy , Jain Akshay , Tor Aamodt , and Timothy G Rogers . 2018. Exploring modern GPU memory system design challenges through accurate modeling. arXiv preprint arXiv:1810.07269 ( 2018 ). Mahmoud Khairy, Jain Akshay, Tor Aamodt, and Timothy G Rogers. 2018. Exploring modern GPU memory system design challenges through accurate modeling. arXiv preprint arXiv:1810.07269 (2018)."},{"key":"e_1_3_2_2_71_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACSOS-C51401.2020.00023"},{"key":"e_1_3_2_2_72_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097--1105.  Alex Krizhevsky Ilya Sutskever and Geoffrey E Hinton. 2012. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 1097--1105."},{"key":"e_1_3_2_2_73_1","volume-title":"Coordinated CTA combination and bandwidth partitioning for GPU concurrent kernel execution. ACM Transactions on Architecture and Code Optimization (TACO'19) 16, 3","author":"Lin Zhen","year":"2019","unstructured":"Zhen Lin , Hongwen Dai , Michael Mantor , and Huiyang Zhou . 2019. Coordinated CTA combination and bandwidth partitioning for GPU concurrent kernel execution. ACM Transactions on Architecture and Code Optimization (TACO'19) 16, 3 ( 2019 ), 1--27. Zhen Lin, Hongwen Dai, Michael Mantor, and Huiyang Zhou. 2019. Coordinated CTA combination and bandwidth partitioning for GPU concurrent kernel execution. ACM Transactions on Architecture and Code Optimization (TACO'19) 16, 3 (2019), 1--27."},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00024"},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.2983731"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155650"},{"key":"e_1_3_2_2_77_1","volume-title":"Research design and statistical analysis","author":"Myers Jerome L","unstructured":"Jerome L Myers , Arnold Well , and Robert Frederick Lorch . 2010. Research design and statistical analysis . Routledge . Jerome L Myers, Arnold Well, and Robert Frederick Lorch. 2010. Research design and statistical analysis. Routledge."},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/2628071.2628117"},{"key":"e_1_3_2_2_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037707"},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446064"},{"key":"e_1_3_2_2_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835965"},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00016"},{"key":"e_1_3_2_2_83_1","doi-asserted-by":"publisher","DOI":"10.1145\/1996130.1996160"},{"key":"e_1_3_2_2_84_1","volume-title":"cCUDA: Effective Co-Scheduling of Concurrent Kernels on GPUs","author":"Shekofteh Seyed Kazem","year":"2019","unstructured":"Seyed Kazem Shekofteh , Hamid Noori , Mahmoud Naghibzadeh , Holger Froening , and Hadi Sadoghi Yazdi . 2019. cCUDA: Effective Co-Scheduling of Concurrent Kernels on GPUs . IEEE Transactions on Parallel and Distributed Systems ( 2019 ). Seyed Kazem Shekofteh, Hamid Noori, Mahmoud Naghibzadeh, Holger Froening, and Hadi Sadoghi Yazdi. 2019. cCUDA: Effective Co-Scheduling of Concurrent Kernels on GPUs. IEEE Transactions on Parallel and Distributed Systems (2019)."},{"key":"e_1_3_2_2_85_1","volume-title":"Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing.","author":"Stratton John","year":"2012","unstructured":"John Stratton , Christopher Rodrigues , Sung I- Jui , Nady Obeid , Li-Wen Chang , Nasser Anssari , Geng Liu , and Wen-mei Hwu. 2012 . Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. (2012). John Stratton, Christopher Rodrigues, Sung I-Jui, Nady Obeid, Li-Wen Chang, Nasser Anssari, Geng Liu, and Wen-mei Hwu. 2012. Parboil: A Revised Benchmark Suite for Scientific and Commercial Throughput Computing. (2012)."},{"key":"e_1_3_2_2_86_1","volume-title":"SMQoS: Improving Utilization and Energy Efficiency with QoS Awareness on GPUs. In 2019 IEEE International Conference on Cluster Computing (CLUSTER'19)","author":"Sun Qingxiao","year":"2019","unstructured":"Qingxiao Sun , Yi Liu , Hailong Yang , Zhongzhi Luan , and Depei Qian . 2019 . SMQoS: Improving Utilization and Energy Efficiency with QoS Awareness on GPUs. In 2019 IEEE International Conference on Cluster Computing (CLUSTER'19) . IEEE, 1--5. Qingxiao Sun, Yi Liu, Hailong Yang, Zhongzhi Luan, and Depei Qian. 2019. SMQoS: Improving Utilization and Energy Efficiency with QoS Awareness on GPUs. In 2019 IEEE International Conference on Cluster Computing (CLUSTER'19). IEEE, 1--5."},{"key":"e_1_3_2_2_87_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.73"},{"key":"e_1_3_2_2_88_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783718"},{"key":"e_1_3_2_2_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00030"},{"key":"e_1_3_2_2_90_1","doi-asserted-by":"publisher","DOI":"10.1190\/geo2017-0624.1"},{"key":"e_1_3_2_2_91_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446078"},{"key":"e_1_3_2_2_92_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080203"},{"key":"e_1_3_2_2_93_1","volume-title":"Gandiva: Introspective cluster scheduling for deep learning. In 13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18). 595--610.","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao , Romil Bhardwaj , Ramachandran Ramjee , Muthian Sivathanu , Nipun Kwatra , Zhenhua Han , Pratyush Patel , Xuan Peng , Hanyu Zhao , Quanlu Zhang , 2018 . Gandiva: Introspective cluster scheduling for deep learning. In 13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18). 595--610. Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, Quanlu Zhang, et al. 2018. Gandiva: Introspective cluster scheduling for deep learning. In 13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18). 595--610."},{"key":"e_1_3_2_2_94_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001161"},{"key":"e_1_3_2_2_95_1","doi-asserted-by":"publisher","DOI":"10.1145\/3155284.3018754"},{"key":"e_1_3_2_2_96_1","volume-title":"Deadline-Aware Offloading for High-Throughput Accelerators. In 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 479--492","author":"Yeh Tsung Tai","year":"2021","unstructured":"Tsung Tai Yeh , Matthew D Sinclair , Bradford M Beckmann , and Timothy G Rogers . 2021 . Deadline-Aware Offloading for High-Throughput Accelerators. In 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 479--492 . Tsung Tai Yeh, Matthew D Sinclair, Bradford M Beckmann, and Timothy G Rogers. 2021. Deadline-Aware Offloading for High-Throughput Accelerators. In 2021 IEEE International Symposium on High-Performance Computer Architecture (HPCA). IEEE, 479--492."},{"key":"e_1_3_2_2_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378466"},{"key":"e_1_3_2_2_98_1","volume-title":"Salus: Fine-Grained GPU Sharing Among CNN Applications.","author":"Yu Peifeng","year":"2020","unstructured":"Peifeng Yu and Mosharaf Chowdhury . 2020 . Salus: Fine-Grained GPU Sharing Among CNN Applications. (2020). Peifeng Yu and Mosharaf Chowdhury. 2020. Salus: Fine-Grained GPU Sharing Among CNN Applications. (2020)."},{"key":"e_1_3_2_2_99_1","volume-title":"G-net: Effective {GPU} sharing in {NFV} systems. In 15th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 18). 187--200.","author":"Zhang Kai","year":"2018","unstructured":"Kai Zhang , Bingsheng He , Jiayu Hu , Zeke Wang , Bei Hua , Jiayi Meng , and Lishan Yang . 2018 . G-net: Effective {GPU} sharing in {NFV} systems. In 15th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 18). 187--200. Kai Zhang, Bingsheng He, Jiayu Hu, Zeke Wang, Bei Hua, Jiayi Meng, and Lishan Yang. 2018. G-net: Effective {GPU} sharing in {NFV} systems. In 15th {USENIX} Symposium on Networked Systems Design and Implementation ({NSDI} 18). 187--200."},{"key":"e_1_3_2_2_100_1","doi-asserted-by":"publisher","DOI":"10.1145\/3330345.3330351"},{"key":"e_1_3_2_2_101_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.53"},{"key":"e_1_3_2_2_102_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2018.2851207"},{"key":"e_1_3_2_2_103_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00074"},{"key":"e_1_3_2_2_104_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378457"},{"key":"e_1_3_2_2_105_1","doi-asserted-by":"publisher","DOI":"10.1145\/3205289.3205311"},{"key":"e_1_3_2_2_106_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2013.257"}],"event":{"name":"PACT '22: International Conference on Parallel Architectures and Compilation Techniques","location":"Chicago Illinois","acronym":"PACT '22","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IFIP WG 10.3 IFIP WG 10.3","IEEE CS"]},"container-title":["Proceedings of the International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3559009.3569650","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3559009.3569650","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:38Z","timestamp":1750186958000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3559009.3569650"}},"subtitle":["A Holistic Approach to Fine-Grained GPU Sharing in the Cloud"],"short-title":[],"issued":{"date-parts":[[2022,10,8]]},"references-count":106,"alternative-id":["10.1145\/3559009.3569650","10.1145\/3559009"],"URL":"https:\/\/doi.org\/10.1145\/3559009.3569650","relation":{},"subject":[],"published":{"date-parts":[[2022,10,8]]},"assertion":[{"value":"2023-01-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}