{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T07:18:12Z","timestamp":1769843892023,"version":"3.49.0"},"reference-count":72,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018,9]]},"DOI":"10.1109\/iiswc.2018.8573483","type":"proceedings-article","created":{"date-parts":[[2018,12,14]],"date-time":"2018-12-14T00:48:21Z","timestamp":1544748501000},"page":"191-202","source":"Crossref","is-referenced-by-count":49,"title":["Tartan: Evaluating Modern GPU Interconnect via a Multi-GPU Benchmark Suite"],"prefix":"10.1109","author":[{"given":"Ang","family":"Li","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shuaiwen Leon","family":"Song","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jieyang","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xu","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nathan","family":"Tallent","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kevin","family":"Barker","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339595"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2009.4919648"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2017.259"},{"key":"ref38","first-page":"315","article-title":"Numerical experiments in homogeneous turbulence, NASA Tech","volume":"81","author":"rogallo","year":"1981","journal-title":"memo"},{"key":"ref33","year":"0","journal-title":"A CUDA implementation of the PageRank Pipeline Benchmark"},{"key":"ref32","year":"0","journal-title":"GPU implementation of classical molecular dynamics proxy application"},{"key":"ref31","year":"0","journal-title":"Livermore Unstructured Lagrangian Explicit Shock Hydrodynamics (LULESH)"},{"key":"ref30","author":"diaz","year":"0","journal-title":"Multi-GPU (CUDA-MPI) baseline implementation of Heat Equation and the inviscid Burgers' equation"},{"key":"ref37","year":"0","journal-title":"GPU implementation of classical molecular dynamics proxy application"},{"key":"ref36","author":"ferreiro","year":"0","journal-title":"Cusimann An Optimized Simulated Annealing Software for GPUs"},{"key":"ref35","author":"agarwal","year":"0","journal-title":"Multi-GPU Matrix Multiplication using CUDA and MPI"},{"key":"ref34","author":"mart\u00edn","year":"0","journal-title":"HIT a parallel GPGPU code to simulate Homogeneous Isotropic Turbulence"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751218"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1007\/s00450-011-0171-3"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2018.00034"},{"key":"ref63","doi-asserted-by":"crossref","DOI":"10.1109\/SC.2016.51","article-title":"dCUDA: hardware supported overlap of computation and communication","author":"gysi","year":"2016","journal-title":"International Conference for High Performance Computing Networking Storage and Analysis (SC)"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1016\/j.cpc.2016.04.007"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2017.94"},{"key":"ref27","author":"dimitrov","year":"0","journal-title":"Multi-GPU Code to Count all PLANAR Lang-ford Sequences"},{"key":"ref65","article-title":"Optimized Broadcast for Deep Learning Workloads on Dense-GPU InfiniBand Clusters: MPI or NCCL?","author":"awan","year":"2017","journal-title":"arXiv preprint arXiv 1707 07816"},{"key":"ref66","article-title":"Performance Modeling and Evaluation of Distributed Deep Learning Frameworks on GPUs","author":"shi","year":"2017","journal-title":"arXiv preprint arXiv 1711 03890"},{"key":"ref29","author":"liu","year":"0","journal-title":"Efficient Large-scale Parallel Stencil Computation on Multi-Core and Multi-GPU Accelerated Clusters"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref68","article-title":"Parboil: A revised benchmark suite for scientific and commercial throughput computing","author":"stratton","year":"2012","journal-title":"Center for Reliable and High-Performance Computing"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735702"},{"key":"ref2","author":"fuhrer","year":"0","journal-title":"Near-global climate simulation at 1 km resolution establishing a performance baseline on 4888 GPUs with COSMO 5 0"},{"key":"ref1","article-title":"Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour","author":"goyal","year":"2017","journal-title":"arXiv preprint arXiv 1706 02677"},{"key":"ref20","year":"0","journal-title":"Summit Early Access Development Platform"},{"key":"ref22","year":"0","journal-title":"Developing a Linux Kernel Module Using RDMAfor GPUDirect"},{"key":"ref21","year":"0","journal-title":"Summit The Next Leap in Leadership-Class Computing Systems for Open Science"},{"key":"ref24","author":"ferreiro","year":"0","journal-title":"Cusimann An Optimized Simulated Annealing Software for GPUs"},{"key":"ref23","year":"0","journal-title":"High-Performance C++\/CUDA Implementation of Convolutional Neural Networks Version-2"},{"key":"ref26","year":"0","journal-title":"Kmeans Clustering with Multi-GPU Capabilities"},{"key":"ref25","author":"pangborn","year":"0","journal-title":"Expectation Maximization with a Gaussian Mixture Model using CUDA"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751232"},{"key":"ref51","article-title":"SFU-driven transparent approximation acceleration on GPUs","author":"li","year":"2016","journal-title":"International Conference on Supercomputing (ICS)"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807611"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2013.222"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.55"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/1964179.1964194"},{"key":"ref55","article-title":"Warp-consolidation: A novel execution model for gpus","author":"li","year":"2018","journal-title":"International Conference on Supercomputing (ICS)"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3093315.3037709"},{"key":"ref53","article-title":"A synchronization-free algorithm for parallel sparse triangular solves","author":"liu","year":"2016","journal-title":"European Conference on Parallel Processing (EUROPAR)"},{"key":"ref52","article-title":"Critical points based register-concurrency autotuning for GPUs","author":"li","year":"2016","journal-title":"Design Automation and Test in Europe Conference"},{"key":"ref10","year":"0","journal-title":"Developing a Linux Kernel Module Using RDMAfor GPUDirect"},{"key":"ref11","year":"0","journal-title":"ROCm Driver RDMA Peer to Peer Support"},{"key":"ref40","article-title":"Evaluating on-node gpu interconnects for deep learning workloads","author":"tallent","year":"2017","journal-title":"International Workshop on Performance Modeling Benchmarking and Simulation of High Performance Computer Systems"},{"key":"ref12","year":"2015","journal-title":"CUDA SDK Code Samples"},{"key":"ref13","year":"0","journal-title":"NCCL Tests"},{"key":"ref14","year":"0","journal-title":"NVIDIA Collective Communications Library (NCCL)"},{"key":"ref15","year":"0","journal-title":"NVIDIA Collective Communications Library (NCCL)"},{"key":"ref16","year":"0","journal-title":"MPI-GPU-BW"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1287\/opre.9.3.383"},{"key":"ref18","year":"0","journal-title":"Cirrascale SR3514 Unexpected Performance Inequality Technical Brief M901A-092014"},{"key":"ref19","year":"0","journal-title":"ROCm Communication Collectives Library (RCCL)"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-8659.2010.01769.x"},{"key":"ref3","year":"2015","journal-title":"NVIDIA DGX-1 System Architecture White Paper"},{"key":"ref6","year":"0","journal-title":"The System Bottleneck Shifts to PCI-Express"},{"key":"ref5","article-title":"Graph processing on GPU: Where are the bottlenecks?","author":"xu","year":"2014","journal-title":"International Symposium on Workload Characterization (IISWC)"},{"key":"ref8","article-title":"Introduction to infiniband for end users","author":"grun","year":"2010","journal-title":"White paper Infini-Band Trade Association"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.37"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807606"},{"key":"ref9","year":"2018","journal-title":"CUDA Programming Guide"},{"key":"ref46","year":"0","journal-title":"HIP Convert CUDA to Portable C++ Code"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126931"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.89"},{"key":"ref47","article-title":"Transit: A visual analytical model for multithreaded machines","author":"li","year":"2015","journal-title":"International Symposium on High Performance Distributed Computing (HPDC)"},{"key":"ref42","doi-asserted-by":"crossref","first-page":"28","DOI":"10.1016\/j.jpdc.2017.12.007","article-title":"Gpudirect async: Exploring gpu synchronous communication techniques for infiniband clusters","volume":"114","author":"agostini","year":"2018","journal-title":"Journal of Parallel and Distributed Computing"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/HiPC.2017.00037"},{"key":"ref44","author":"jeffers","year":"2013","journal-title":"Intel Xeon Phi Coprocessor High Performance Programming"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1145\/3126908.3126950"}],"event":{"name":"2018 IEEE International Symposium on Workload Characterization (IISWC)","location":"Raleigh, NC","start":{"date-parts":[[2018,9,30]]},"end":{"date-parts":[[2018,10,2]]}},"container-title":["2018 IEEE International Symposium on Workload Characterization (IISWC)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/8554060\/8573472\/08573483.pdf?arnumber=8573483","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,1,27]],"date-time":"2022-01-27T11:02:03Z","timestamp":1643281323000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8573483\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,9]]},"references-count":72,"URL":"https:\/\/doi.org\/10.1109\/iiswc.2018.8573483","relation":{},"subject":[],"published":{"date-parts":[[2018,9]]}}}