{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,14]],"date-time":"2026-03-14T19:09:55Z","timestamp":1773515395062,"version":"3.50.1"},"reference-count":109,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2023,1,1]]},"DOI":"10.1109\/tpds.2022.3218508","type":"journal-article","created":{"date-parts":[[2022,11,4]],"date-time":"2022-11-04T01:01:29Z","timestamp":1667523689000},"page":"275-290","source":"Crossref","is-referenced-by-count":14,"title":["Improving the Scalability of GPU Synchronization Primitives"],"prefix":"10.1109","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-9617-590X","authenticated-orcid":false,"given":"Preyesh","family":"Dalmia","sequence":"first","affiliation":[{"name":"University of Wisconsin-Madison, Madison, WI, USA"}]},{"given":"Rohan","family":"Mahapatra","sequence":"additional","affiliation":[{"name":"University of California, San Diego, La Jolla, CA, USA"}]},{"given":"Jeremy","family":"Intan","sequence":"additional","affiliation":[{"name":"University of Illinois at Urbana-Champaign, Champaign, IL, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1565-2784","authenticated-orcid":false,"given":"Dan","family":"Negrut","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison, Madison, WI, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0189-7895","authenticated-orcid":false,"given":"Matthew D.","family":"Sinclair","sequence":"additional","affiliation":[{"name":"University of Wisconsin-Madison, Madison, WI, USA"}]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/2889488"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2014.24"},{"key":"ref33","article-title":"HSA platform system architecture specification","year":"2015"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541981"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1007\/BF01379320"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835930"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1115\/1.4023915"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00038"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/WSCAD.2018.00013"},{"key":"ref28","article-title":"Cooperative groups: Flexible CUDA thread programming","author":"harris","year":"2017"},{"key":"ref27","article-title":"Optimizing parallel reduction in CUDA","volume":"2","author":"harris","year":"2007","journal-title":"NVIDIA Developer Technology"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1007\/11602569_6"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/INTERCON.2019.8853605"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2701618"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/NoCS.2013.6558404"},{"key":"ref24","first-page":"819","article-title":"Gustafson's law","author":"gustafson","year":"2011","journal-title":"Encyclopedia of Parallel Computing"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1145\/3293883.3295727"},{"key":"ref26","first-page":"1","article-title":"KLAP: Kernel launch aggregation and promotion for optimizing dynamic parallelism","author":"hajj","year":"2016","journal-title":"Proc IEEE\/ACM 49th Annu Int Symp Microarchit"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1145\/2851141.2851145"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00058"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2018.05.013"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1007\/s11044-011-9246-y"},{"key":"ref59","article-title":"CUDA programming guide","year":"2016"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.88"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1145\/3303084.3309488"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.5757"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1017\/S1431927605050968"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1145\/3456727.3463779"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1145\/360248.360253"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/103727.103729"},{"key":"ref40","article-title":"NVIDIA ampere architecture in-depth","author":"krashinsky","year":"2020"},{"key":"ref4","article-title":"AMD graphics core next (GCN) architecture, generation 3","year":"2016"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783729"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.1109\/71.80120"},{"key":"ref5","article-title":"OpenCL programming guide","year":"2016"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375591"},{"key":"ref7","first-page":"161","author":"bader","year":"2018","journal-title":"ch Benchmarking for Graph Clustering and Partitioning"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1145\/2735627"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751232"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00028"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/1040305.1040336"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304043"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1145\/361082.361093"},{"key":"ref41","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc 25th Int Conf Neural Inf Process Syst"},{"key":"ref44","article-title":"P1135R5: The C 20 synchronization library","author":"lelbach","year":"2019"},{"key":"ref43","author":"howes","year":"2015","journal-title":"The OpenCL Specification version 2 0"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1145\/800105.803398"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1016\/j.matcom.2011.11.005"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1103\/PhysRevE.96.042905"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694346"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.690"},{"key":"ref77","article-title":"YOLOv3: An incremental improvement","volume":"abs 1804 2767","author":"redmon","year":"2018","journal-title":"CoRR"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1016\/0020-0190(81)90106-X"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/3180270.3180271"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00054"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/REAL.1996.563705"},{"key":"ref60","article-title":"Pascal P100","year":"2016"},{"key":"ref62","article-title":"Cuda-GDB","year":"2017"},{"key":"ref61","article-title":"Pascal P102","year":"2016"},{"key":"ref63","article-title":"CUDA Stream Management","year":"2018"},{"key":"ref64","article-title":"libcu: The C standard library for your entire system","year":"2020"},{"key":"ref65","article-title":"Split arrive\/wait barrier","year":"2020"},{"key":"ref66","article-title":"Inside volta: The world's most advanced data center GPU","year":"2017"},{"key":"ref67","article-title":"Profiler user's guide","year":"2018"},{"key":"ref68","article-title":"Understanding and using atomic memory operations","author":"nyland","year":"2013","journal-title":"Proc GPU Technol Conf"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2012.6168946"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694350"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123976"},{"key":"ref109","article-title":"Sparse persistent RNNs: Squeezing large recurrent networks on-chip","author":"zhu","year":"2018","journal-title":"Proc 6th Int Conf Learn Representations"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/RTOSS.1994.292571"},{"key":"ref94","article-title":"Efficient synchronization primitives for GPUs","author":"stuart","year":"2011"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00057"},{"key":"ref93","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC47752.2019.9042139"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001161"},{"key":"ref92","doi-asserted-by":"publisher","DOI":"10.1145\/2983990.2984032"},{"key":"ref106","first-page":"1","article-title":"Inter-block GPU communication via fast barrier synchronization","author":"xiao","year":"2010","journal-title":"Proc IEEE Int Parallel Distrib Process Symp"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/s10586-017-0768-9"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835939"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522351"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037742"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080203"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446078"},{"key":"ref98","first-page":"1","article-title":"Chainer: A. next-generation open source framework for deep learning","author":"tokui","year":"2015","journal-title":"Proc Workshop Mach Learn Syst 29th Annu Conf Neural Inf Process Syst"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2016.73"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1016\/0743-7315(90)90022-H"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-40361-8_2"},{"key":"ref10","first-page":"57","article-title":"On dynamic load balancing on graphics processors","author":"cederman","year":"2008","journal-title":"Proc 23rd ACM SIGGRAPH\/EUROGRAPHICS Symp Graph Hardware"},{"key":"ref11","article-title":"NVIDIA cuda samples","author":"center","year":"2022"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2013.6704684"},{"key":"ref13","article-title":"9th DIMACS implementation challenge","author":"demetrescu","year":"2006"},{"key":"ref14","first-page":"2024","article-title":"Persistent RNNs: Stashing recurrent weights on-chip","author":"diamos","year":"2016","journal-title":"Proc 33nd Int Conf Mach Learn"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1007\/978-94-009-7893-5_15"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00087"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-48096-0_45"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1145\/2602988.2602993"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/EMRTS.2003.1212739"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783714"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1145\/800015.808203"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2011.34"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v29i1.9277"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/329466.329486"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2017.8167781"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS48437.2020.00027"},{"key":"ref86","first-page":"97","article-title":"Scan primitives for GPU computing","author":"sengupta","year":"0","journal-title":"Proc 22nd ACM SIGGRAPH\/EUROGRAPHICS Symp Graph Hardware"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830821"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080206"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/9956797\/09933620.pdf?arnumber=9933620","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,12]],"date-time":"2022-12-12T19:32:14Z","timestamp":1670873534000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9933620\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,1]]},"references-count":109,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2022.3218508","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,1,1]]}}}