{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:34:33Z","timestamp":1750221273036,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":38,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,11,12]],"date-time":"2017-11-12T00:00:00Z","timestamp":1510444800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2017,11,12]]},"DOI":"10.1145\/3126908.3126914","type":"proceedings-article","created":{"date-parts":[[2017,11,8]],"date-time":"2017-11-08T21:02:30Z","timestamp":1510174950000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Gravel"],"prefix":"10.1145","author":[{"given":"Marc S.","family":"Orr","sequence":"first","affiliation":[{"name":"UW-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuai","family":"Che","sequence":"additional","affiliation":[{"name":"AMD Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Bradford M.","family":"Beckmann","sequence":"additional","affiliation":[{"name":"AMD Research"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mark","family":"Oskin","sequence":"additional","affiliation":[{"name":"University of Washington"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Steven K.","family":"Reinhardt","sequence":"additional","affiliation":[{"name":"Microsoft"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"David A.","family":"Wood","sequence":"additional","affiliation":[{"name":"AMD Research, UW-Madison"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2017,11,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"The Green500 List. {Online}. http:\/\/www.green500.org\/  The Green500 List. {Online}. http:\/\/www.green500.org\/"},{"key":"e_1_3_2_1_2_1","unstructured":"Amazon Elastic Compute Cloud User Guide for Linux Instances. {Online}. http:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/using_cluster_computing.html.  Amazon Elastic Compute Cloud User Guide for Linux Instances. {Online}. http:\/\/docs.aws.amazon.com\/AWSEC2\/latest\/UserGuide\/using_cluster_computing.html."},{"key":"e_1_3_2_1_3_1","unstructured":"Microsoft Azure N Series GPU enabled Virtual Machines. {Online}. https:\/\/azure.microsoft.com\/en-us\/pricing\/details\/virtual-machines\/series\/#n-series.  Microsoft Azure N Series GPU enabled Virtual Machines. {Online}. https:\/\/azure.microsoft.com\/en-us\/pricing\/details\/virtual-machines\/series\/#n-series."},{"key":"e_1_3_2_1_4_1","unstructured":"Google Cloud Platform: GRAPHICS PROCESSING UNIT (GPU) Leverage GPUs on Google Cloud for machine learning and scientific computing. {Online}. https:\/\/cloud.google.com\/gpu\/.  Google Cloud Platform: GRAPHICS PROCESSING UNIT (GPU) Leverage GPUs on Google Cloud for machine learning and scientific computing. {Online}. https:\/\/cloud.google.com\/gpu\/."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1978542.1978549"},{"key":"e_1_3_2_1_6_1","unstructured":"M. Abadi et al. 2015. TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems. Google preliminary whitepaper.  M. Abadi et al. 2015. TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems. Google preliminary whitepaper."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"D. Yu K. Yao and Y. Zhang. 2015. The Computational Network Toolkit {Best of the Web}. In IEEE Signal Processing Magazine.  D. Yu K. Yao and Y. Zhang. 2015. The Computational Network Toolkit {Best of the Web}. In IEEE Signal Processing Magazine.","DOI":"10.1109\/MSP.2015.2462371"},{"volume-title":"BigLearn NIPS Workshop.","author":"Collobert R.","key":"e_1_3_2_1_8_1","unstructured":"R. Collobert , K. Kavukcuoglu , and C. Farabet . 2011. Torch7: A Matlab-Like Environment for Machine Learning . In BigLearn NIPS Workshop. R. Collobert, K. Kavukcuoglu, and C. Farabet. 2011. Torch7: A Matlab-Like Environment for Machine Learning. In BigLearn NIPS Workshop."},{"key":"e_1_3_2_1_9_1","unstructured":"Install GraphLab Create with GPU Acceleration. {Online}. https:\/\/dato.com\/download\/install-graphlab-create-gpu.html\/.  Install GraphLab Create with GPU Acceleration. {Online}. https:\/\/dato.com\/download\/install-graphlab-create-gpu.html\/."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1807167.1807184"},{"volume-title":"Proc. of the USENIX Annual Technical Conference (ATC).","author":"Nelson J.","key":"e_1_3_2_1_11_1","unstructured":"J. Nelson , B. Holt , B. Myers , P. Briggs , L. Ceze , S. Kahan , and M. Oskin . 2015. Latency-tolerant Software Distributed Shared Memory . In Proc. of the USENIX Annual Technical Conference (ATC). J. Nelson, B. Holt, B. Myers, P. Briggs, L. Ceze, S. Kahan, and M. Oskin. 2015. Latency-tolerant Software Distributed Shared Memory. In Proc. of the USENIX Annual Technical Conference (ATC)."},{"volume-title":"Proc. of the USENIX Conference on Operating Systems Design and Implementation (OSDI).","author":"Gonzalez J.","key":"e_1_3_2_1_12_1","unstructured":"J. Gonzalez , Y. Low , H. Gu , D. Bickson , and C. Guestrin . 2012. PowerGraph: Distributed Graph-Parallel Computation on Natural Graphs . In Proc. of the USENIX Conference on Operating Systems Design and Implementation (OSDI). J. Gonzalez, Y. Low, H. Gu, D. Bickson, and C. Guestrin. 2012. PowerGraph: Distributed Graph-Parallel Computation on Natural Graphs. In Proc. of the USENIX Conference on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2013.17"},{"volume-title":"Proc. of the International Supercomputing Conference (ISC).","author":"Wang H.","key":"e_1_3_2_1_14_1","unstructured":"H. Wang , S. Potluri , M. Luo , A. Singh , S. Sur , and D. Panda . 2011. MVAPICH2-GPU: Optimized GPU to GPU Communication for InfiniBand Clusters . In Proc. of the International Supercomputing Conference (ISC). H. Wang, S. Potluri, M. Luo, A. Singh, S. Sur, and D. Panda. 2011. MVAPICH2-GPU: Optimized GPU to GPU Communication for InfiniBand Clusters. In Proc. of the International Supercomputing Conference (ISC)."},{"volume-title":"Proc. of the IEEE International Conference Cluster Computing (Cluster).","author":"Oden L.","key":"e_1_3_2_1_15_1","unstructured":"L. Oden and H. Fr\u00f6ning . 2013. GGAS: Global GPU Address Spaces for Efficient Communication in Heterogeneous Clusters . In Proc. of the IEEE International Conference Cluster Computing (Cluster). L. Oden and H. Fr\u00f6ning. 2013. GGAS: Global GPU Address Spaces for Efficient Communication in Heterogeneous Clusters. In Proc. of the IEEE International Conference Cluster Computing (Cluster)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161065"},{"key":"e_1_3_2_1_17_1","unstructured":"S. Potluri N. Luehr and N. Sakharnykh. 2016. Simplifying Multi-GPU Communication with NVSHMEM. {Online}. http:\/\/on-demand.gputechconf.com\/gtc\/2016\/presentation\/s6378-nathan-luehr-simplyfing-multi-gpu-communication-nvshmem.pdf.  S. Potluri N. Luehr and N. Sakharnykh. 2016. Simplifying Multi-GPU Communication with NVSHMEM. {Online}. http:\/\/on-demand.gputechconf.com\/gtc\/2016\/presentation\/s6378-nathan-luehr-simplyfing-multi-gpu-communication-nvshmem.pdf."},{"volume-title":"Proc. of the USENIX Symp. on Operating Systems Design and Implementation (OSDI).","author":"Kim S.","key":"e_1_3_2_1_18_1","unstructured":"S. Kim , S. Huh , Y. Hu , X. Zhang , E. Witchel , A. Wated , and M. Silberstein . 2014. GPUnet: Networking Abstractions for GPU Programs . In Proc. of the USENIX Symp. on Operating Systems Design and Implementation (OSDI). S. Kim, S. Huh, Y. Hu, X. Zhang, E. Witchel, A. Wated, and M. Silberstein. 2014. GPUnet: Networking Abstractions for GPU Programs. In Proc. of the USENIX Symp. on Operating Systems Design and Implementation (OSDI)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2931088.2931091"},{"key":"e_1_3_2_1_20_1","unstructured":"OpenCL 2.0 Reference Pages. {Online}. http:\/\/www.khronos.org\/registry\/cl\/sdk\/2.0\/docs\/man\/xhtml\/.  OpenCL 2.0 Reference Pages. {Online}. http:\/\/www.khronos.org\/registry\/cl\/sdk\/2.0\/docs\/man\/xhtml\/."},{"key":"e_1_3_2_1_21_1","unstructured":"CUDA C Programming Guide. {Online}. http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/.  CUDA C Programming Guide. {Online}. http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/."},{"key":"e_1_3_2_1_22_1","unstructured":"HSA Foundation. 2015. HSA Programmer's Reference Manual: HSAIL Virtual ISA and Programming Model Compiler Writer's Guide and Object Format (BRIG) Version 1.0.1.  HSA Foundation. 2015. HSA Programmer's Reference Manual: HSAIL Virtual ISA and Programming Model Compiler Writer's Guide and Object Format (BRIG) Version 1.0.1."},{"key":"e_1_3_2_1_23_1","unstructured":"S. Junkins. 2016. The Compute Architecture of Intel\u00ae Processor Graphics Gen9. Intel whitepaper v1.0.  S. Junkins. 2016. The Compute Architecture of Intel\u00ae Processor Graphics Gen9. Intel whitepaper v1.0."},{"key":"e_1_3_2_1_24_1","unstructured":"HPC Challenge Benchmark: RandomAccess. {Online}. http:\/\/icl.cs.utk.edu\/projectsfiles\/hpcc\/RandomAccess\/.  HPC Challenge Benchmark: RandomAccess. {Online}. http:\/\/icl.cs.utk.edu\/projectsfiles\/hpcc\/RandomAccess\/."},{"key":"e_1_3_2_1_25_1","unstructured":"Wikipedia. Counting Sort. {Online}. https:\/\/en.wikipedia.org\/wiki\/Counting_sort.  Wikipedia. Counting Sort. {Online}. https:\/\/en.wikipedia.org\/wiki\/Counting_sort."},{"volume-title":"Proc. of the International Symp. on Computer Architecture (ISCA).","author":"Orr M.","key":"e_1_3_2_1_26_1","unstructured":"M. Orr , B. Beckmann , S. Reinhardt , and D. Wood . 2014. Fine-Grain Task Aggregation and Coordination on GPUs . In Proc. of the International Symp. on Computer Architecture (ISCA). M. Orr, B. Beckmann, S. Reinhardt, and D. Wood. 2014. Fine-Grain Task Aggregation and Coordination on GPUs. In Proc. of the International Symp. on Computer Architecture (ISCA)."},{"key":"e_1_3_2_1_27_1","unstructured":"H. Levy. 2003. Single Producer Consumer on a Bounded Array Problem. Course notes. {Online}. https:\/\/courses.cs.washington.edu\/courses\/cse451\/03wi\/section\/prodcons.htm.  H. Levy. 2003. Single Producer Consumer on a Bounded Array Problem. Course notes. {Online}. https:\/\/courses.cs.washington.edu\/courses\/cse451\/03wi\/section\/prodcons.htm."},{"volume-title":"Proc. of the International Symp. on High Performance Computer Architecture (HPCA).","author":"Fung W.","key":"e_1_3_2_1_28_1","unstructured":"W. Fung and T. Aamodt . 2011. Thread Block Compaction for Efficient SIMT Control Flow . In Proc. of the International Symp. on High Performance Computer Architecture (HPCA). W. Fung and T. Aamodt. 2011. Thread Block Compaction for Efficient SIMT Control Flow. In Proc. of the International Symp. on High Performance Computer Architecture (HPCA)."},{"key":"e_1_3_2_1_29_1","unstructured":"University of Florida Sparse Matrix Collection. {Online}. http:\/\/www.cise.ufl.edu\/research\/sparse\/matrices\/.  University of Florida Sparse Matrix Collection. {Online}. http:\/\/www.cise.ufl.edu\/research\/sparse\/matrices\/."},{"key":"e_1_3_2_1_30_1","unstructured":"NERSC. Meraculous Data. {Online}. http:\/\/portal.nersc.gov\/project\/m888\/apex\/Meraculous_data\/.  NERSC. Meraculous Data. {Online}. http:\/\/portal.nersc.gov\/project\/m888\/apex\/Meraculous_data\/."},{"key":"e_1_3_2_1_31_1","unstructured":"OpenMPI FAQ. {Online}. https:\/\/www.open-mpi.org\/faq\/?category=supported-systems#thread-support.  OpenMPI FAQ. {Online}. https:\/\/www.open-mpi.org\/faq\/?category=supported-systems#thread-support."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC.2014.7040962"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.41"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451169"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750393"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/7902.7903"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2014.117"},{"key":"e_1_3_2_1_38_1","unstructured":"CCIX Consortium. Cache Coherent Interconnect for Accelerators (CCIX). {Online}. http:\/\/www.ccixconsortium.com  CCIX Consortium. Cache Coherent Interconnect for Accelerators (CCIX). {Online}. http:\/\/www.ccixconsortium.com"}],"event":{"name":"SC '17: The International Conference for High Performance Computing, Networking, Storage and Analysis","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"],"location":"Denver Colorado","acronym":"SC '17"},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3126908.3126914","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3126908.3126914","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T02:11:08Z","timestamp":1750212668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3126908.3126914"}},"subtitle":["fine-grain GPU-initiated network messages"],"short-title":[],"issued":{"date-parts":[[2017,11,12]]},"references-count":38,"alternative-id":["10.1145\/3126908.3126914","10.1145\/3126908"],"URL":"https:\/\/doi.org\/10.1145\/3126908.3126914","relation":{},"subject":[],"published":{"date-parts":[[2017,11,12]]},"assertion":[{"value":"2017-11-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}