{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,7,1]],"date-time":"2025-07-01T12:48:50Z","timestamp":1751374130733,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":39,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,11,1]],"date-time":"2018-11-01T00:00:00Z","timestamp":1541030400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1745813 and 1725743"],"award-info":[{"award-number":["1745813 and 1725743"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100005856","name":"Faculdade de Ci\u00eancias e Tecnologia, Universidade Nova de Lisboa","doi-asserted-by":"publisher","award":["UID\/CEC\/50021\/2013"],"award-info":[{"award-number":["UID\/CEC\/50021\/2013"]}],"id":[{"id":"10.13039\/501100005856","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,11]]},"DOI":"10.1145\/3243176.3243179","type":"proceedings-article","created":{"date-parts":[[2018,10,10]],"date-time":"2018-10-10T13:32:32Z","timestamp":1539178352000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["ComP-net"],"prefix":"10.1145","author":[{"given":"Michael","family":"LeBeane","sequence":"first","affiliation":[{"name":"The University of Texas at Austin"}]},{"given":"Khaled","family":"Hamidouche","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc."}]},{"given":"Brad","family":"Benton","sequence":"additional","affiliation":[{"name":"Advanced Micro Devices, Inc."}]},{"given":"Mauricio","family":"Breternitz","sequence":"additional","affiliation":[{"name":"University of Lisbon"}]},{"given":"Steven K.","family":"Reinhardt","sequence":"additional","affiliation":[{"name":"Microsoft Corporation"}]},{"given":"Lizy K.","family":"John","sequence":"additional","affiliation":[{"name":"The University of Texas at Austin"}]}],"member":"320","published-online":{"date-parts":[[2018,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Amit Agarwal Eldar Akchurin Chris Basoglu Guoguo Chen Scott Cyphers Jasha Droppo Adam Eversole Brian Guenter Mark Hillebrand T. Ryan Hoens Xuedong Huang Zhiheng Huang Vladimir Ivanov Alexey Kamenev Philipp Kranen Oleksii Kuchaiev Wolfgang Manousek Avner May Bhaskar Mitra Olivier Nano Gaizka Navarro Alexey Orlov Hari Parthasarathi Baolin Peng Marko Radmilac Alexey Reznichenko Frank Seide Michael L. Seltzer Malcolm Slaney Andreas Stolcke Huaming Wang Yongqiang Wang Kaisheng Yao Dong Yu Yu Zhang and Geoffrey Zweig. 2014. An Introduction to Computational Networks and the Computational Network Toolkit. Technical Report. Microsoft. https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2014\/08\/CNTKBook-20160217.pdf  Amit Agarwal Eldar Akchurin Chris Basoglu Guoguo Chen Scott Cyphers Jasha Droppo Adam Eversole Brian Guenter Mark Hillebrand T. Ryan Hoens Xuedong Huang Zhiheng Huang Vladimir Ivanov Alexey Kamenev Philipp Kranen Oleksii Kuchaiev Wolfgang Manousek Avner May Bhaskar Mitra Olivier Nano Gaizka Navarro Alexey Orlov Hari Parthasarathi Baolin Peng Marko Radmilac Alexey Reznichenko Frank Seide Michael L. Seltzer Malcolm Slaney Andreas Stolcke Huaming Wang Yongqiang Wang Kaisheng Yao Dong Yu Yu Zhang and Geoffrey Zweig. 2014. An Introduction to Computational Networks and the Computational Network Toolkit . Technical Report. Microsoft. https:\/\/www.microsoft.com\/en-us\/research\/wp-content\/uploads\/2014\/08\/CNTKBook-20160217.pdf"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGRID.2017.29"},{"key":"e_1_3_2_1_3_1","unstructured":"AMD. 2015. The AMD gem5 APU Simulator: Modeling Heterogeneous Systems in gem5. http:\/\/gem5.org\/GPU_Models  AMD. 2015. The AMD gem5 APU Simulator: Modeling Heterogeneous Systems in gem5. http:\/\/gem5.org\/GPU_Models"},{"key":"e_1_3_2_1_4_1","unstructured":"AMD. 2017. Graphics Core Next Architecture Generation 3 ISA. http:\/\/gpuopen.com\/compute-product\/amd-gcn3-isa-architecture-manual\/  AMD. 2017. Graphics Core Next Architecture Generation 3 ISA. http:\/\/gpuopen.com\/compute-product\/amd-gcn3-isa-architecture-manual\/"},{"key":"e_1_3_2_1_5_1","volume-title":"HIP: Heterogeneous-computing Interface for Portability","author":"AMD.","year":"2018","unstructured":"AMD. 2018 . HIP: Heterogeneous-computing Interface for Portability . http:\/\/rocm-developer-tools.github.io\/HIP\/ AMD. 2018. HIP: Heterogeneous-computing Interface for Portability. http:\/\/rocm-developer-tools.github.io\/HIP\/"},{"key":"e_1_3_2_1_6_1","unstructured":"Baidu. 2018. baidu-allreduce. https:\/\/github.com\/baidu-research\/baidu-allreduce  Baidu. 2018. baidu-allreduce. https:\/\/github.com\/baidu-research\/baidu-allreduce"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Matthew Baker Swen Boehm Aurelien Bouteiller Barbara Chapman Robert Cernohous James Culhane Tony Curtis James Dinan Mike Dubman Karl Feind Manjunath Gorentla Venkata Max Grossman Khaled Hamidouche Jeff Hammond Yossi Itigin Bryant Lam David Knaak Jeff Kuehn Jens Manser Tiffany M. Mintz David Ozog Nicholas Park Steve Poole Wendy Poole Swaroop Pophale Sreeram Potluri Howard Pritchard Naveen Ravichandrasekaran Michael Raymond James Ross Pavel Shamis Sameer Shende and Lauren Smith. 2018. OpenSHMEM Specification. http:\/\/openshmem.org\/site\/sites\/default\/site_files\/OpenSHMEM-1.4.pdf  Matthew Baker Swen Boehm Aurelien Bouteiller Barbara Chapman Robert Cernohous James Culhane Tony Curtis James Dinan Mike Dubman Karl Feind Manjunath Gorentla Venkata Max Grossman Khaled Hamidouche Jeff Hammond Yossi Itigin Bryant Lam David Knaak Jeff Kuehn Jens Manser Tiffany M. Mintz David Ozog Nicholas Park Steve Poole Wendy Poole Swaroop Pophale Sreeram Potluri Howard Pritchard Naveen Ravichandrasekaran Michael Raymond James Ross Pavel Shamis Sameer Shende and Lauren Smith. 2018. OpenSHMEM Specification. http:\/\/openshmem.org\/site\/sites\/default\/site_files\/OpenSHMEM-1.4.pdf","DOI":"10.2172\/1460190"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2024716.2024718"},{"volume-title":"CUDA Kernel Based Collective Reduction Operations on Large-scale GPU Clusters. In Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid).","author":"Chu Ching-Hsiang","key":"e_1_3_2_1_9_1","unstructured":"Ching-Hsiang Chu , Khaled Hamidouche , Akshay Venkatesh , Ammar Ahmad Awan , and Dhabaleswar K. Panda . 2016 . CUDA Kernel Based Collective Reduction Operations on Large-scale GPU Clusters. In Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid). Ching-Hsiang Chu, Khaled Hamidouche, Akshay Venkatesh, Ammar Ahmad Awan, and Dhabaleswar K. Panda. 2016. CUDA Kernel Based Collective Reduction Operations on Large-scale GPU Clusters. In Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2931088.2931091"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2012.257"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/3014904.3014974"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2016.05.003"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541981"},{"issue":"0","key":"e_1_3_2_1_15_1","first-page":"2","article-title":"InfiniBand Architecture Specification","volume":"1","author":"InfiniBand Trade Association","year":"2000","unstructured":"InfiniBand Trade Association . 2000 . InfiniBand Architecture Specification : Release 1 . 0 . 2 . http:\/\/www.infinibandta.org\/content\/pages.php?pg=technology_download InfiniBand Trade Association. 2000. InfiniBand Architecture Specification: Release 1.0.2. http:\/\/www.infinibandta.org\/content\/pages.php?pg=technology_download","journal-title":"Release"},{"key":"e_1_3_2_1_16_1","volume-title":"GPUnet: Networking Abstractions for GPU Programs. In USENIX Conf. on Operating Systems Design and Implementation (OSDI). 201--216","author":"Kim Sangman","year":"2014","unstructured":"Sangman Kim , Seonggu Huh , Yige Hu , Xinya Zhang , Emmett Witchel , Amir Wated , and Mark Silberstein . 2014 . GPUnet: Networking Abstractions for GPU Programs. In USENIX Conf. on Operating Systems Design and Implementation (OSDI). 201--216 . Sangman Kim, Seonggu Huh, Yige Hu, Xinya Zhang, Emmett Witchel, Amir Wated, and Mark Silberstein. 2014. GPUnet: Networking Abstractions for GPU Programs. In USENIX Conf. on Operating Systems Design and Implementation (OSDI). 201--216."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICPPW.2014.61"},{"key":"e_1_3_2_1_18_1","volume-title":"Analyzing Communication Models for Distributed Thread-collaborative Processors in Terms of Energy and Time. In Intl. Symp. on Performance Analysis of Systems and Software (ISPASS).","author":"Klenk Benjamin","year":"2015","unstructured":"Benjamin Klenk , Lena Oden , and Holger Froning . 2015 . Analyzing Communication Models for Distributed Thread-collaborative Processors in Terms of Energy and Time. In Intl. Symp. on Performance Analysis of Systems and Software (ISPASS). Benjamin Klenk, Lena Oden, and Holger Froning. 2015. Analyzing Communication Models for Distributed Thread-collaborative Processors in Terms of Energy and Time. In Intl. Symp. on Performance Analysis of Systems and Software (ISPASS)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126950"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/1669112.1669172"},{"key":"e_1_3_2_1_21_1","unstructured":"Mellanox. 2017. Mellanox OFED GPUDirect RDMA. http:\/\/www.mellanox.com\/page\/products_dyn?product_family=116  Mellanox. 2017. Mellanox OFED GPUDirect RDMA. http:\/\/www.mellanox.com\/page\/products_dyn?product_family=116"},{"key":"e_1_3_2_1_22_1","unstructured":"Mellanox. 2018. InfiniBand Performance. http:\/\/www.mellanox.com\/page\/performance_infiniband  Mellanox. 2018. InfiniBand Performance. http:\/\/www.mellanox.com\/page\/performance_infiniband"},{"key":"e_1_3_2_1_23_1","unstructured":"Nvidia. 2016. Fast Multi-GPU collectives with NCCL. https:\/\/devblogs.nvidia.com\/parallelforall\/fast-multi-gpu-collectives-nccl\/  Nvidia. 2016. Fast Multi-GPU collectives with NCCL. https:\/\/devblogs.nvidia.com\/parallelforall\/fast-multi-gpu-collectives-nccl\/"},{"key":"e_1_3_2_1_24_1","unstructured":"Nvidia. 2017. GPU Applications. http:\/\/www.nvidia.com\/object\/gpu-applications-domain.html  Nvidia. 2017. GPU Applications. http:\/\/www.nvidia.com\/object\/gpu-applications-domain.html"},{"key":"e_1_3_2_1_25_1","unstructured":"Nvidia. 2018. CUDA Toolkit 9.2. https:\/\/developer.nvidia.com\/cuda-toolkit  Nvidia. 2018. CUDA Toolkit 9.2. https:\/\/developer.nvidia.com\/cuda-toolkit"},{"key":"e_1_3_2_1_26_1","unstructured":"Nvidia. 2018. Nvidia DGX-2. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-2\/  Nvidia. 2018. Nvidia DGX-2. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-2\/"},{"key":"e_1_3_2_1_27_1","volume-title":"GGAS: Global GPU Address Spaces for Efficient Communication in Heterogeneous Clusters. In Intl. Conf. on Cluster Computing (CLUSTER). 1--8.","author":"Oden Lena","year":"2013","unstructured":"Lena Oden and Holger Froning . 2013 . GGAS: Global GPU Address Spaces for Efficient Communication in Heterogeneous Clusters. In Intl. Conf. on Cluster Computing (CLUSTER). 1--8. Lena Oden and Holger Froning. 2013. GGAS: Global GPU Address Spaces for Efficient Communication in Heterogeneous Clusters. In Intl. Conf. on Cluster Computing (CLUSTER). 1--8."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2014.111"},{"volume-title":"Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid)","author":"Oden Lena","key":"e_1_3_2_1_29_1","unstructured":"Lena Oden , Benjamin Klenk , and Holger Froning . 2014. Energy-Efficient Collective Reduce and Allreduce Operations on Distributed GPUs . In Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid) . Institute of Electrical and Electronics Engineers (IEEE) , 483--492. Lena Oden, Benjamin Klenk, and Holger Froning. 2014. Energy-Efficient Collective Reduce and Allreduce Operations on Distributed GPUs. In Intl. Symp. on Cluster, Cloud and Grid Computing (CCGrid). Institute of Electrical and Electronics Engineers (IEEE), 483--492."},{"volume-title":"Fine-grain Task Aggregation and Coordination on GPUs. In Intl. Symp. on Computer Architecture (ISCA). 181--192","author":"Orr Marc S.","key":"e_1_3_2_1_30_1","unstructured":"Marc S. Orr , Bradford M. Beckmann , Steven K. Reinhardt , and David A. Wood . 2014 . Fine-grain Task Aggregation and Coordination on GPUs. In Intl. Symp. on Computer Architecture (ISCA). 181--192 . http:\/\/dl.acm.org\/citation.cfm?id=2665671.2665701 Marc S. Orr, Bradford M. Beckmann, Steven K. Reinhardt, and David A. Wood. 2014. Fine-grain Task Aggregation and Coordination on GPUs. In Intl. Symp. on Computer Architecture (ISCA). 181--192. http:\/\/dl.acm.org\/citation.cfm?id=2665671.2665701"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126914"},{"volume-title":"GPU-Centric Communication on NVIDIA GPU Clusters with InfiniBand: A Case Study with OpenSHMEM. In Intl. Conf. on High Performance Computing (HiPC). 253--262","author":"Potluri S.","key":"e_1_3_2_1_32_1","unstructured":"S. Potluri , A. Goswami , D. Rossetti , C. J. Newburn , M. G. Venkata , and N. Imam . 2017 . GPU-Centric Communication on NVIDIA GPU Clusters with InfiniBand: A Case Study with OpenSHMEM. In Intl. Conf. on High Performance Computing (HiPC). 253--262 . S. Potluri, A. Goswami, D. Rossetti, C. J. Newburn, M. G. Venkata, and N. Imam. 2017. GPU-Centric Communication on NVIDIA GPU Clusters with InfiniBand: A Case Study with OpenSHMEM. In Intl. Conf. on High Performance Computing (HiPC). 253--262."},{"key":"e_1_3_2_1_33_1","unstructured":"Sreeram Potluri Nathan Luehr and Nikolay Sakharnykh. 2016. Simplifying Multi-GPU Communication with NVSHMEM. http:\/\/on-demand-gtc.gputechconf.com\/gtc-quicklink\/7D7mU  Sreeram Potluri Nathan Luehr and Nikolay Sakharnykh. 2016. Simplifying Multi-GPU Communication with NVSHMEM. http:\/\/on-demand-gtc.gputechconf.com\/gtc-quicklink\/7D7mU"},{"key":"e_1_3_2_1_34_1","unstructured":"Davide Rossetti. 2015. GPUDirect Async. http:\/\/on-demand.gputechconf.com\/gtc\/2015\/presentation\/S5412-Davide-Rossetti.pdf  Davide Rossetti. 2015. GPUDirect Async. http:\/\/on-demand.gputechconf.com\/gtc\/2015\/presentation\/S5412-Davide-Rossetti.pdf"},{"key":"e_1_3_2_1_35_1","unstructured":"Sandia National Laboratories. 2017. The Portals 4.1 Network Programming Interface. http:\/\/www.cs.sandia.gov\/Portals\/portals41.pdf  Sandia National Laboratories. 2017. The Portals 4.1 Network Programming Interface. http:\/\/www.cs.sandia.gov\/Portals\/portals41.pdf"},{"key":"e_1_3_2_1_36_1","unstructured":"Sandia National Laboratories. 2018. Sandia OpenSHMEM. https:\/\/github.com\/Sandia-OpenSHMEM\/SOS  Sandia National Laboratories. 2018. Sandia OpenSHMEM. https:\/\/github.com\/Sandia-OpenSHMEM\/SOS"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161065"},{"key":"e_1_3_2_1_38_1","unstructured":"TACC. 2015. Stampede Supercomputer User Guide. https:\/\/portal.tacc.utexas.edu\/user-guides\/stampede  TACC. 2015. Stampede Supercomputer User Guide. https:\/\/portal.tacc.utexas.edu\/user-guides\/stampede"},{"key":"e_1_3_2_1_39_1","unstructured":"TOP500.org. 2018. Highlights - June 2018. https:\/\/www.top500.org\/lists\/2018\/06\/highlights\/  TOP500.org. 2018. Highlights - June 2018. https:\/\/www.top500.org\/lists\/2018\/06\/highlights\/"}],"event":{"name":"PACT '18: International conference on Parallel Architectures and Compilation Techniques","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IFIP WG 10.3 IFIP WG 10.3","IEEE CS"],"location":"Limassol Cyprus","acronym":"PACT '18"},"container-title":["Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3243176.3243179","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3243176.3243179","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3243176.3243179","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:57:39Z","timestamp":1750208259000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3243176.3243179"}},"subtitle":["command processor networking for efficient intra-kernel communications on GPUs"],"short-title":[],"issued":{"date-parts":[[2018,11]]},"references-count":39,"alternative-id":["10.1145\/3243176.3243179","10.1145\/3243176"],"URL":"https:\/\/doi.org\/10.1145\/3243176.3243179","relation":{},"subject":[],"published":{"date-parts":[[2018,11]]},"assertion":[{"value":"2018-11-01","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}