{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,4]],"date-time":"2026-05-04T23:28:44Z","timestamp":1777937324559,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2017,11,12]],"date-time":"2017-11-12T00:00:00Z","timestamp":1510444800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1337393"],"award-info":[{"award-number":["1337393"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2017,11,12]]},"DOI":"10.1145\/3126908.3126950","type":"proceedings-article","created":{"date-parts":[[2017,11,8]],"date-time":"2017-11-08T21:02:30Z","timestamp":1510174950000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":17,"title":["GPU triggered networking for intra-kernel communications"],"prefix":"10.1145","author":[{"given":"Michael","family":"LeBeane","sequence":"first","affiliation":[{"name":"The University of Texas at Austin"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Khaled","family":"Hamidouche","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Brad","family":"Benton","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc."}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mauricio","family":"Breternitz","sequence":"additional","affiliation":[{"name":"Instituto Universitario de Lisboa"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Steven K.","family":"Reinhardt","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lizy K.","family":"John","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2017,11,12]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Amit Agarwal Eldar Akchurin Chris Basoglu Guoguo Chen Scott Cyphers Jasha Droppo Adam Eversole Brian Guenter Mark Hillebrand T. Ryan Hoens Xuedong Huang Zhiheng Huang Vladimir Ivanov Alexey Kamenev Philipp Kranen Oleksii Kuchaiev Wolfgang Manousek Avner May Bhaskar Mitra Olivier Nano Gaizka Navarro Alexey Orlov Hari Parthasarathi Baolin Peng Marko Radmilac Alexey Reznichenko Frank Seide Michael L. Seltzer Malcolm Slaney Andreas Stolcke Huaming Wang Yongqiang Wang Kaisheng Yao Dong Yu Yu Zhang and Geoffrey Zweig. 2014. An Introduction to Computational Networks and the Computational Network Toolkit. Technical Report. Microsoft. https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2014\/08\/CNTKBook-20160217.pdf  Amit Agarwal Eldar Akchurin Chris Basoglu Guoguo Chen Scott Cyphers Jasha Droppo Adam Eversole Brian Guenter Mark Hillebrand T. Ryan Hoens Xuedong Huang Zhiheng Huang Vladimir Ivanov Alexey Kamenev Philipp Kranen Oleksii Kuchaiev Wolfgang Manousek Avner May Bhaskar Mitra Olivier Nano Gaizka Navarro Alexey Orlov Hari Parthasarathi Baolin Peng Marko Radmilac Alexey Reznichenko Frank Seide Michael L. Seltzer Malcolm Slaney Andreas Stolcke Huaming Wang Yongqiang Wang Kaisheng Yao Dong Yu Yu Zhang and Geoffrey Zweig. 2014. An Introduction to Computational Networks and the Computational Network Toolkit. Technical Report. Microsoft. https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2014\/08\/CNTKBook-20160217.pdf"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2017.29"},{"key":"e_1_3_2_1_3_1","unstructured":"Amazon. 2016. Amazon EC2 Cloud Computing. https:\/\/aws.amazon.com\/ec2  Amazon. 2016. Amazon EC2 Cloud Computing. https:\/\/aws.amazon.com\/ec2"},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2015. The AMD gem5 APU Simulator: Modeling Heterogeneous Systems in gem5. http:\/\/gem5.org\/GPU_Models  AMD. 2015. The AMD gem5 APU Simulator: Modeling Heterogeneous Systems in gem5. http:\/\/gem5.org\/GPU_Models"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_2_1_6_1","volume-title":"BXI: Bull eXascale Interconnect. https:\/\/atos.net\/en\/products\/high-performance-computing-hpc\/bxi-bull-exascale-interconnect","year":"2017","unstructured":"Bull. 2017 . BXI: Bull eXascale Interconnect. https:\/\/atos.net\/en\/products\/high-performance-computing-hpc\/bxi-bull-exascale-interconnect Bull. 2017. BXI: Bull eXascale Interconnect. https:\/\/atos.net\/en\/products\/high-performance-computing-hpc\/bxi-bull-exascale-interconnect"},{"key":"e_1_3_2_1_7_1","volume-title":"CUDA Kernel Based Collective Reduction Operations on Large-scale GPU Clusters. In Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid).","author":"Chu Ching-Hsiang","unstructured":"Ching-Hsiang Chu , Khaled Hamidouche , Akshay Venkatesh , Ammar Ahmad Awan , and Dhabaleswar K. Panda . 2016 . CUDA Kernel Based Collective Reduction Operations on Large-scale GPU Clusters. In Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid). Ching-Hsiang Chu, Khaled Hamidouche, Akshay Venkatesh, Ammar Ahmad Awan, and Dhabaleswar K. Panda. 2016. CUDA Kernel Based Collective Reduction Operations on Large-scale GPU Clusters. In Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid)."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2931088.2931091"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000108"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-8659.2008.01131.x"},{"key":"e_1_3_2_1_11_1","volume-title":"Workshop on Exascale MPI (ExaMPI).","author":"Grant Ryan E","year":"2015","unstructured":"Ryan E Grant , Anthony Skjellum , and V Purushotham . 2015 . Lightweight threading with MPI using Persistent Communications Semantics . In Workshop on Exascale MPI (ExaMPI). Ryan E Grant, Anthony Skjellum, and V Purushotham. 2015. Lightweight threading with MPI using Persistent Communications Semantics. In Workshop on Exascale MPI (ExaMPI)."},{"key":"e_1_3_2_1_12_1","unstructured":"Khronos Group. 2017. OpenCL. https:\/\/www.khronos.org\/opencl\/  Khronos Group. 2017. OpenCL. https:\/\/www.khronos.org\/opencl\/"},{"key":"e_1_3_2_1_13_1","volume-title":"Intl. Conf. for High Performance Computing, Networking, Storage and Analysis (SC) (SC '16)","author":"Jeremia B\u00e4r Tobias Gysi","year":"2016","unstructured":"Tobias Gysi Jeremia B\u00e4r , and Torsten Hoefler . 2016 . dCUDA: Hardware Supported Overlap of Computation and Communication . In Intl. Conf. for High Performance Computing, Networking, Storage and Analysis (SC) (SC '16) . Article 52, 12 pages. Tobias Gysi Jeremia B\u00e4r, and Torsten Hoefler. 2016. dCUDA: Hardware Supported Overlap of Computation and Communication. In Intl. Conf. for High Performance Computing, Networking, Storage and Analysis (SC) (SC '16). Article 52, 12 pages."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2016.05.003"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541981"},{"issue":"0","key":"e_1_3_2_1_17_1","first-page":"2","article-title":"InfiniBand Architecture Specification","volume":"1","author":"InfiniBand Trade Association","year":"2000","unstructured":"InfiniBand Trade Association . 2000 . InfiniBand Architecture Specification : Release 1 . 0 . 2 . http:\/\/www.infinibandta.org\/content\/pages.php?pg=technology_download InfiniBand Trade Association. 2000. InfiniBand Architecture Specification: Release 1.0.2. http:\/\/www.infinibandta.org\/content\/pages.php?pg=technology_download","journal-title":"Release"},{"key":"e_1_3_2_1_18_1","unstructured":"InfiniBand Trade Association. 2014. RDMA over Converged Ethernet v2. https:\/\/cw.infinibandta.org\/document\/dl\/7781  InfiniBand Trade Association. 2014. RDMA over Converged Ethernet v2. https:\/\/cw.infinibandta.org\/document\/dl\/7781"},{"key":"e_1_3_2_1_19_1","unstructured":"Intel. 2010. Internet Wide Area RDMA Protocol (iWARP). http:\/\/www.intel.com\/content\/dam\/doc\/technology-brief\/iwarp-brief.pdf  Intel. 2010. Internet Wide Area RDMA Protocol (iWARP). http:\/\/www.intel.com\/content\/dam\/doc\/technology-brief\/iwarp-brief.pdf"},{"key":"e_1_3_2_1_20_1","unstructured":"Intel. 2015. Omni-Path Fabric 100 Series. https:\/\/fabricbuilders.intel.com\/  Intel. 2015. Omni-Path Fabric 100 Series. https:\/\/fabricbuilders.intel.com\/"},{"key":"e_1_3_2_1_21_1","volume-title":"GPUnet: Networking Abstractions for GPU Programs. In USENLX Conf. on Operating Systems Design and Implementation (OSDI). 201--216","author":"Kim Sangman","year":"2014","unstructured":"Sangman Kim , Seonggu Huh , Yige Hu , Xinya Zhang , Emmett Witchel , Amir Wated , and Mark Silberstein . 2014 . GPUnet: Networking Abstractions for GPU Programs. In USENLX Conf. on Operating Systems Design and Implementation (OSDI). 201--216 . Sangman Kim, Seonggu Huh, Yige Hu, Xinya Zhang, Emmett Witchel, Amir Wated, and Mark Silberstein. 2014. GPUnet: Networking Abstractions for GPU Programs. In USENLX Conf. on Operating Systems Design and Implementation (OSDI). 201--216."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2014.61"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2015.7095817"},{"key":"e_1_3_2_1_24_1","unstructured":"Jim Lambers. 2010. Jacobi Methods. http:\/\/web.stanford.edu\/class\/cme335\/lecture7.pdf  Jim Lambers. 2010. Jacobi Methods. http:\/\/web.stanford.edu\/class\/cme335\/lecture7.pdf"},{"key":"e_1_3_2_1_25_1","unstructured":"Mellanox. 2017. Mellanox OFED GPUDirect RDMA. http:\/\/www.mellanox.com\/page\/products_dyn?product_family=116  Mellanox. 2017. Mellanox OFED GPUDirect RDMA. http:\/\/www.mellanox.com\/page\/products_dyn?product_family=116"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2159430.2159433"},{"key":"e_1_3_2_1_27_1","volume-title":"MPI: A Message-Passing Interface Standard. Ver. 3. www.mpi-forum.org\/docs\/mpi-3.0\/mpi30-report.pdf","author":"Forum MPI","year":"2012","unstructured":"MPI Forum . 2012 . MPI: A Message-Passing Interface Standard. Ver. 3. www.mpi-forum.org\/docs\/mpi-3.0\/mpi30-report.pdf MPI Forum. 2012. MPI: A Message-Passing Interface Standard. Ver. 3. www.mpi-forum.org\/docs\/mpi-3.0\/mpi30-report.pdf"},{"key":"e_1_3_2_1_28_1","unstructured":"Nvidia. 2016. CUDA Toolkit 8.0. https:\/\/developer.nvidia.com\/cuda-toolkit  Nvidia. 2016. CUDA Toolkit 8.0. https:\/\/developer.nvidia.com\/cuda-toolkit"},{"key":"e_1_3_2_1_29_1","unstructured":"Nvidia. 2016. Fast Multi-GPU collectives with NCCL. https:\/\/devblogs.nvidia.com\/parallelforall\/fast-multi-gpu-collectives-nccl\/  Nvidia. 2016. Fast Multi-GPU collectives with NCCL. https:\/\/devblogs.nvidia.com\/parallelforall\/fast-multi-gpu-collectives-nccl\/"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2013.6702638"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2014.111"},{"key":"e_1_3_2_1_32_1","unstructured":"Sreeram Potluri Nathan Luehr and Nikolay Sakharnykh. 2016. Simplifying Multi-GPU Communication with NVSHMEM. http:\/\/on-demand-gtc.gputechconf.com\/gtc-quicklink\/7D7mU  Sreeram Potluri Nathan Luehr and Nikolay Sakharnykh. 2016. Simplifying Multi-GPU Communication with NVSHMEM. http:\/\/on-demand-gtc.gputechconf.com\/gtc-quicklink\/7D7mU"},{"key":"e_1_3_2_1_33_1","unstructured":"Davide Rossetti. 2015. GPUDirect Async. http:\/\/on-demand.gputechconf.com\/gtc\/2015\/presentation\/S5412-Davide-Rossetti.pdf  Davide Rossetti. 2015. GPUDirect Async. http:\/\/on-demand.gputechconf.com\/gtc\/2015\/presentation\/S5412-Davide-Rossetti.pdf"},{"key":"e_1_3_2_1_34_1","unstructured":"Sandia National Laboratories. 2014. The Portals 4.0.2 Network Programming Interface. http:\/\/www.cs.sandia.gov\/Portals\/portals402.pdf  Sandia National Laboratories. 2014. The Portals 4.0.2 Network Programming Interface. http:\/\/www.cs.sandia.gov\/Portals\/portals402.pdf"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.5555\/2386173.2386183"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161065"},{"key":"e_1_3_2_1_37_1","unstructured":"TACC. 2015. Stampede Supercomputer User Guide. https:\/\/portal.tacc.utexas.edu\/user-guides\/stampede  TACC. 2015. Stampede Supercomputer User Guide. https:\/\/portal.tacc.utexas.edu\/user-guides\/stampede"},{"key":"e_1_3_2_1_38_1","unstructured":"TOP500.org. 2016. Green 500. http:\/\/www.top500.org\/green500  TOP500.org. 2016. Green 500. http:\/\/www.top500.org\/green500"},{"key":"e_1_3_2_1_39_1","unstructured":"TOP500.org. 2017. Highlights - June 2017. https:\/\/www.top500.org\/lists\/2017\/06\/highlights\/  TOP500.org. 2017. Highlights - June 2017. https:\/\/www.top500.org\/lists\/2017\/06\/highlights\/"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI.2011.15"}],"event":{"name":"SC '17: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"Denver Colorado","acronym":"SC '17","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","IEEE CS"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3126908.3126950","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3126908.3126950","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3126908.3126950","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T02:11:08Z","timestamp":1750212668000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3126908.3126950"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,11,12]]},"references-count":39,"alternative-id":["10.1145\/3126908.3126950","10.1145\/3126908"],"URL":"https:\/\/doi.org\/10.1145\/3126908.3126950","relation":{},"subject":[],"published":{"date-parts":[[2017,11,12]]},"assertion":[{"value":"2017-11-12","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}