{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,2]],"date-time":"2026-07-02T23:42:29Z","timestamp":1783035749134,"version":"3.54.6"},"publisher-location":"New York, NY, USA","reference-count":74,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,11,13]],"date-time":"2021-11-13T00:00:00Z","timestamp":1636761600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"funder":[{"name":"University of Sydney"},{"name":"NSF","award":["1815643, 1955650, 2047521"],"award-info":[{"award-number":["1815643, 1955650, 2047521"]}]},{"name":"Australia Research Council (ARC)","award":["DP210101984"],"award-info":[{"award-number":["DP210101984"]}]},{"name":"U.S. Dept. of Energy's Office of Science Center for Advanced Technology Evaluation (CENATE)"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021,11,14]]},"DOI":"10.1145\/3458817.3480853","type":"proceedings-article","created":{"date-parts":[[2021,10,21]],"date-time":"2021-10-21T05:10:34Z","timestamp":1634793034000},"page":"1-14","source":"Crossref","is-referenced-by-count":12,"title":["MAPA"],"prefix":"10.1145","author":[{"given":"Kiran","family":"Ranganath","sequence":"first","affiliation":[{"name":"University of California Riverside"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Joshua D.","family":"Suetterlein","sequence":"additional","affiliation":[{"name":"Pacific Northwest National Lab"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Joseph B.","family":"Manzano","sequence":"additional","affiliation":[{"name":"Pacific Northwest National Lab"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Shuaiwen Leon","family":"Song","sequence":"additional","affiliation":[{"name":"University of Sydney, Sydney, Australia"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Daniel","family":"Wong","sequence":"additional","affiliation":[{"name":"University of California Riverside"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,11,13]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"2021. Nvidia Docker Containers. https:\/\/docs.nvidia.com\/datacenter\/cloud-native\/container-toolkit\/install-guide.html  2021. Nvidia Docker Containers. https:\/\/docs.nvidia.com\/datacenter\/cloud-native\/container-toolkit\/install-guide.html"},{"key":"e_1_3_2_2_2_1","unstructured":"2021. NVIDIA Multi-instance GPU. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html  2021. NVIDIA Multi-instance GPU. https:\/\/docs.nvidia.com\/datacenter\/tesla\/mig-user-guide\/index.html"},{"key":"e_1_3_2_2_3_1","volume-title":"2013 46th Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO). 111--122","author":"Abdel-Majeed Mohammad","year":"2013"},{"key":"e_1_3_2_2_4_1","volume-title":"Proceedings of the 2016 International Conference on Supercomputing (ICS '16)","author":"Abdel-Majeed Mohammad","year":"2016"},{"key":"e_1_3_2_2_5_1","volume-title":"2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA). IEEE, 333--346","author":"Abdolrashidi AmirAli","year":"2021"},{"key":"e_1_3_2_2_6_1","volume-title":"Proceedings of the 50th Annual IEEE\/ACM International Symposium on Microarchitecture. 600--611","author":"Abdolrashidi AmirAli","year":"2017"},{"key":"e_1_3_2_2_7_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1--12","author":"Amaral Marcelo","year":"2017"},{"key":"e_1_3_2_2_8_1","unstructured":"Amazon. 2019. Amazon EC2 elastic GPUs. https:\/\/aws.amazon.com\/ec2\/elastic-gpus\/ Accessed 04-20-2019.  Amazon. 2019. Amazon EC2 elastic GPUs. https:\/\/aws.amazon.com\/ec2\/elastic-gpus\/ Accessed 04-20-2019."},{"key":"e_1_3_2_2_9_1","unstructured":"Advanced Micro Devices (AMD). 2021. ROCm Communication Collectives Library. https:\/\/github.com\/ROCmSoftwarePlatform\/rccl  Advanced Micro Devices (AMD). 2021. ROCm Communication Collectives Library. https:\/\/github.com\/ROCmSoftwarePlatform\/rccl"},{"key":"e_1_3_2_2_10_1","volume-title":"Proceedings of the 23rd European MPI Users' Group Meeting. 15--22","author":"Awan Ammar Ahmad","year":"2016"},{"key":"e_1_3_2_2_11_1","volume-title":"Machine learning climate model dynamics: Offline versus online performance. arXiv preprint arXiv:2011.03081","author":"Brenowitz Noah D","year":"2020"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2696940"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"crossref","unstructured":"J. Deng W. Dong R. Socher L.-J. Li K. Li and L. Fei-Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. In CVPR09.  J. Deng W. Dong R. Socher L.-J. Li K. Li and L. Fei-Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. In CVPR09.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_2_14_1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1--15","author":"Dong Wenqian","year":"2020"},{"key":"e_1_3_2_2_15_1","volume-title":"2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). IEEE, 1022--1035","author":"Du\u0163u Alexandru","year":"2020"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"crossref","first-page":"129","DOI":"10.1504\/IJHPCN.2012.046370","article-title":"Algorithmic skeletons for multicore, multi-GPU systems and clusters","volume":"7","author":"Ernsting Steffen","year":"2012","journal-title":"International Journal of High Performance Computing and Networking"},{"key":"e_1_3_2_2_17_1","unstructured":"Facebook. 2018. Facebook Flexible GPU Expander Big Basin Refresh. https:\/\/www.opencompute.org\/files\/OCP2018-Facebook-Flexible-GPU-Expander-Big-Basin-Refresh-v0.7.pdf  Facebook. 2018. Facebook Flexible GPU Expander Big Basin Refresh. https:\/\/www.opencompute.org\/files\/OCP2018-Facebook-Flexible-GPU-Expander-Big-Basin-Refresh-v0.7.pdf"},{"key":"e_1_3_2_2_18_1","volume-title":"2016 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW). IEEE, 712--720","author":"Faraji Iman","year":"2016"},{"key":"e_1_3_2_2_19_1","volume-title":"2019 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW). IEEE, 422--429","author":"Fox Geoffrey","year":"2019"},{"key":"e_1_3_2_2_20_1","volume-title":"Google Cloud: Cloud GPUs. https:\/\/cloud.google.com\/gpu Accessed 04-16-2020.","year":"2020"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"crossref","first-page":"78","DOI":"10.1109\/MCSE.2018.021651341","article-title":"Stepping up to Summit","volume":"20","author":"Hines Jonathan","year":"2018","journal-title":"Computing in Science & Engineering"},{"key":"e_1_3_2_2_22_1","unstructured":"NVIDIA Inc. 2019. NVIDIA DGX-1: The essential instrument for AIResearch: Spec Sheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/dgx-1\/dgx-1-rhel-datasheet-nvidia-us-808336-r3-web.pdf  NVIDIA Inc. 2019. NVIDIA DGX-1: The essential instrument for AIResearch: Spec Sheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/dgx-1\/dgx-1-rhel-datasheet-nvidia-us-808336-r3-web.pdf"},{"key":"e_1_3_2_2_23_1","unstructured":"NVIDIA Inc. 2019. NVIDIA DGX-2: The World Most Powerful Deep Learning System For the Most Complex AI Challenges: Spec Sheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/dgx-1\/dgx-2-datasheet-us-nvidia-955420-r2-web-new.pdf  NVIDIA Inc. 2019. NVIDIA DGX-2: The World Most Powerful Deep Learning System For the Most Complex AI Challenges: Spec Sheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/dgx-1\/dgx-2-datasheet-us-nvidia-955420-r2-web-new.pdf"},{"key":"e_1_3_2_2_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2020.3023723"},{"key":"e_1_3_2_2_25_1","volume-title":"Proceedings of the Fifteenth European Conference on Computer Systems","author":"Jamshidi Kasra","year":"2020"},{"key":"e_1_3_2_2_27_1","volume-title":"Proceedings of the 22nd ACM international conference on Multimedia. 675--678","author":"Jia Yangqing","year":"2014"},{"key":"e_1_3_2_2_28_1","volume-title":"Proceedings of the 22Nd ACM International Conference on Multimedia","author":"Jia Yangqing","year":"2014"},{"key":"e_1_3_2_2_29_1","volume-title":"Proceedings of the 44th annual international symposium on computer architecture. 1--12","author":"Jouppi Norman P","year":"2017"},{"key":"e_1_3_2_2_30_1","volume-title":"International Conference on Computational Science. Springer, 116--130","author":"Kadupitiya JCS","year":"2019"},{"key":"e_1_3_2_2_31_1","doi-asserted-by":"crossref","first-page":"357","DOI":"10.1177\/1094342019899457","article-title":"Machine learning for parameter auto-tuning in molecular dynamics simulations: Efficient dynamics of ions near polarizable nanoparticles","volume":"34","author":"Kadupitiya JCS","year":"2020","journal-title":"The International Journal of High Performance Computing Applications"},{"key":"e_1_3_2_2_32_1","volume-title":"Proceedings of the 2018 International Conference on Computing and Artificial Intelligence","author":"Khan Riaz Ullah","year":"2018"},{"key":"e_1_3_2_2_33_1","volume-title":"Proceedings of the 48th International Conference on Parallel Processing. 1--10","author":"Kobus Robin","year":"2019"},{"key":"e_1_3_2_2_34_1","volume-title":"Advances in Neural Information Processing Systems 25","author":"Krizhevsky Alex"},{"key":"e_1_3_2_2_35_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. 1--30","author":"Li Ang","year":"2019"},{"key":"e_1_3_2_2_36_1","volume-title":"Proceedings of the 2018 International Conference on Supercomputing. 53--64","author":"Li Ang","year":"2018"},{"key":"e_1_3_2_2_37_1","unstructured":"A. Li S. Song J. Chen J. Li X. Liu N. Tallent and K. J. Barker. 2019. Evaluating Modern GPU Interconnect: PCIe NVLink NV-SLI NVSwitch and GPUDirect. IEEE Transactions on Parallel and Distributed Systems (2019). 10.1109\/TPDS.2019.2928289  A. Li S. Song J. Chen J. Li X. Liu N. Tallent and K. J. Barker. 2019. Evaluating Modern GPU Interconnect: PCIe NVLink NV-SLI NVSwitch and GPUDirect. IEEE Transactions on Parallel and Distributed Systems (2019). 10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_2_2_38_1","volume-title":"Jieyang Chen, Jiajia Li, Xu Liu, Nathan R. Tallent, and Kevin J. Barker.","author":"Li Ang","year":"2019"},{"key":"e_1_3_2_2_39_1","volume-title":"2018 IEEE International Symposium on Workload Characterization (IISWC). 191--202","author":"Li Ang","year":"2018"},{"key":"e_1_3_2_2_40_1","volume-title":"Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 1273--1278","author":"Li Ang","year":"2016"},{"key":"e_1_3_2_2_41_1","volume-title":"Proceedings of the International Symposium on Low Power Electronics and Design (ISLPED '18)","author":"Liu Zhenhong","year":"2018"},{"key":"e_1_3_2_2_42_1","volume-title":"Proceedings of the 27th ACM Symposium on Operating Systems Principles (Huntsville","author":"Mawhirter Daniel","year":"2019"},{"key":"e_1_3_2_2_43_1","volume-title":"2018 IEEE International Symposium on Workload Characterization (IISWC). IEEE, 122--133","author":"Mojumder Saiful A","year":"2018"},{"key":"e_1_3_2_2_44_1","unstructured":"Nvidia. 2019. Multi-GPU Programming Models. https:\/\/developer.download.nvidia.com\/video\/gputechconf\/gtc\/2019\/presentation\/s9139-multi-gpu-programming-models.pdf  Nvidia. 2019. Multi-GPU Programming Models. https:\/\/developer.download.nvidia.com\/video\/gputechconf\/gtc\/2019\/presentation\/s9139-multi-gpu-programming-models.pdf"},{"key":"e_1_3_2_2_45_1","unstructured":"Nvidia. 2021. Optimized primitives for collective multi-GPU communication. https:\/\/github.com\/nvidia\/nccl  Nvidia. 2021. Optimized primitives for collective multi-GPU communication. https:\/\/github.com\/nvidia\/nccl"},{"key":"e_1_3_2_2_46_1","volume-title":"V100 white paper","author":"Tesla NVIDIA.","year":"2017"},{"key":"e_1_3_2_2_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2004.75"},{"key":"e_1_3_2_2_48_1","volume-title":"Using Machine Learning at Scale in HPC Simulations with SmartSim: An Application to Ocean Climate Modeling. arXiv preprint arXiv:2104.09355","author":"Partee Sam","year":"2021"},{"key":"e_1_3_2_2_49_1","volume-title":"David Fox, Jim A Gaffney, David Hysom, et al.","author":"Peterson J Luc","year":"2019"},{"key":"e_1_3_2_2_50_1","volume-title":"SC20: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1--16","author":"Rajbhandari Samyam","year":"2020"},{"key":"e_1_3_2_2_51_1","doi-asserted-by":"crossref","first-page":"128","DOI":"10.1109\/LCA.2019.2933842","article-title":"Speeding up Collective Communications Through Inter-GPU Re-routing","volume":"18","author":"Ranganath Kiran","year":"2019","journal-title":"IEEE Computer Architecture Letters"},{"key":"e_1_3_2_2_52_1","unstructured":"Baidu Research. 2017. Baidu All-Reduce. https:\/\/github.com\/baidu-research\/baidu-allreduce  Baidu Research. 2017. Baidu All-Reduce. https:\/\/github.com\/baidu-research\/baidu-allreduce"},{"key":"e_1_3_2_2_53_1","volume-title":"SC'14: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 513--524","author":"Sengupta Dipanjan","year":"2014"},{"key":"e_1_3_2_2_54_1","unstructured":"Noam Shazeer Youlong Cheng Niki Parmar Dustin Tran Ashish Vaswani Penporn Koanantakool Peter Hawkins HyoukJoong Lee Mingsheng Hong Cliff Young Ryan Sepassi and Blake Hechtman. 2018. Mesh-TensorFlow: Deep Learning for Supercomputers. In Neural Information Processing Systems.  Noam Shazeer Youlong Cheng Niki Parmar Dustin Tran Ashish Vaswani Penporn Koanantakool Peter Hawkins HyoukJoong Lee Mingsheng Hong Cliff Young Ryan Sepassi and Blake Hechtman. 2018. Mesh-TensorFlow: Deep Learning for Supercomputers. In Neural Information Processing Systems."},{"key":"e_1_3_2_2_55_1","volume-title":"Very Deep Convolutional Networks for Large-Scale Image Recognition. In International Conference on Learning Representations.","author":"Simonyan Karen","year":"2015"},{"key":"e_1_3_2_2_56_1","volume-title":"2011 IEEE International Conference on Cluster Computing. IEEE, 262--271","author":"Song Shuaiwen","year":"2011"},{"key":"e_1_3_2_2_57_1","volume-title":"Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence","author":"Szegedy Christian"},{"key":"e_1_3_2_2_58_1","volume-title":"Going Deeper with Convolutions. CoRR abs\/1409.4842","author":"Szegedy Christian","year":"2014"},{"key":"e_1_3_2_2_59_1","volume-title":"Proceedings of the 28th ACM international conference on Supercomputing. 221--230","author":"Tallent Nathan R","year":"2014"},{"key":"e_1_3_2_2_60_1","volume-title":"2016 International Conference on Parallel Architecture and Compilation Techniques (PACT). IEEE, 3--15","author":"Tan Jingweijia","year":"2016"},{"key":"e_1_3_2_2_61_1","volume-title":"Proceedings of the 25th Symposium on Operating Systems Principles","author":"Teixeira Carlos H. C.","year":"2015"},{"key":"e_1_3_2_2_62_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3451164","article-title":"Paver: Locality graph-based thread block scheduling for gpus","volume":"18","author":"Tripathy Devashree","year":"2021","journal-title":"ACM Transactions on Architecture and Code Optimization (TACO)"},{"key":"e_1_3_2_2_63_1","volume-title":"Proceedings of the 15th IEEE\/ACM International Conference on Networking, Architecture, and Storage (2021 (To appear)).","author":"Tripathy Devashree","year":"2021"},{"key":"e_1_3_2_2_64_1","volume-title":"Proceedings of the ACM\/IEEE International Symposium on Low Power Electronics and Design. 109--114","author":"Tripathy Devashree","year":"2020"},{"key":"e_1_3_2_2_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/321921.321925"},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/1671970.1921702"},{"key":"e_1_3_2_2_67_1","volume-title":"Blink: Fast and generic collectives for distributed ml. arXiv preprint arXiv:1910.04940","author":"Wang Guanhua","year":"2019"},{"key":"e_1_3_2_2_68_1","volume-title":"Proceedings of the AAAI Conference on Artificial Intelligence","volume":"33","author":"Wang Lijing","year":"2019"},{"key":"e_1_3_2_2_69_1","unstructured":"Emma White. 2019. Optimizing deep learning on P3 and P3dn with EFA. https:\/\/aws.amazon.com\/blogs\/compute\/optimizing-deep-learning-on-p3-and-p3dn-with-efa\/  Emma White. 2019. Optimizing deep learning on P3 and P3dn with EFA. https:\/\/aws.amazon.com\/blogs\/compute\/optimizing-deep-learning-on-p3-and-p3dn-with-efa\/"},{"key":"e_1_3_2_2_70_1","volume-title":"International Conference on High Performance Computing. Springer, 123--143","author":"Wilke Jeremiah J","year":"2018"},{"key":"e_1_3_2_2_71_1","volume-title":"2016 IEEE International Symposium on High Performance Computer Architecture (HPCA). 176--187","author":"Wong Daniel","year":"2016"},{"key":"e_1_3_2_2_72_1","volume-title":"Gandiva: Introspective cluster scheduling for deep learning. In 13th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 18). 595--610.","author":"Xiao Wencong","year":"2018"},{"key":"e_1_3_2_2_73_1","volume-title":"Modular Routing Design for Chiplet-Based Systems. In 2018 ACM\/IEEE 45th Annual International Symposium on Computer Architecture (ISCA). 726--738","author":"Yin J.","year":"2018"},{"key":"e_1_3_2_2_74_1","volume-title":"Proceedings of the ACM International Conference on Supercomputing. 308--318","author":"Zamani Hadi","year":"2019"},{"key":"e_1_3_2_2_75_1","volume-title":"Proceedings of the ACM\/IEEE International Symposium on Low Power Electronics and Design. 205--210","author":"Zamani Hadi","year":"2020"}],"event":{"name":"SC '21: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis Missouri","acronym":"SC '21","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3480853","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3480853","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3458817.3480853","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:12:22Z","timestamp":1750191142000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3458817.3480853"}},"subtitle":["multi-accelerator pattern allocation policy for multi-tenant GPU servers"],"short-title":[],"issued":{"date-parts":[[2021,11,13]]},"references-count":74,"alternative-id":["10.1145\/3458817.3480853","10.1145\/3458817"],"URL":"https:\/\/doi.org\/10.1145\/3458817.3480853","relation":{},"subject":[],"published":{"date-parts":[[2021,11,13]]}}}