{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,5]],"date-time":"2026-03-05T15:33:27Z","timestamp":1772724807565,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2019,4,4]],"date-time":"2019-04-04T00:00:00Z","timestamp":1554336000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"DARPA contract","award":["FA8750-16-2-0004"],"award-info":[{"award-number":["FA8750-16-2-0004"]}]},{"DOI":"10.13039\/100000001","name":"NSF","doi-asserted-by":"publisher","award":["CCF-1823546"],"award-info":[{"award-number":["CCF-1823546"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2019,4,4]]},"DOI":"10.1145\/3297858.3304055","type":"proceedings-article","created":{"date-parts":[[2019,4,4]],"date-time":"2019-04-04T18:38:43Z","timestamp":1554403123000},"page":"793-806","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":13,"title":["Fast Fine-Grained Global Synchronization on GPUs"],"prefix":"10.1145","author":[{"given":"Kai","family":"Wang","sequence":"first","affiliation":[{"name":"University of Texas at Austin, AUSTIN, TX, USA"}]},{"given":"Don","family":"Fussell","sequence":"additional","affiliation":[{"name":"University of Texas at Austin, AUSTIN, TX, USA"}]},{"given":"Calvin","family":"Lin","sequence":"additional","affiliation":[{"name":"University of Texas at Austin, Austin, TX, USA"}]}],"member":"320","published-online":{"date-parts":[[2019,4,4]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2011.87"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195669"},{"key":"e_1_3_2_1_3_1","volume-title":"Proc. KDD Cup. 3--6.","author":"Bennett James","year":"2007","unstructured":"James Bennett and Stan Lanning. 2007. The Netflix Prize. In Proc. KDD Cup. 3--6."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"e_1_3_2_1_5_1","volume-title":"GPU Computing Gems Emerald Edition .Morgan Kaufmann","author":"Burtscher Martin","unstructured":"Martin Burtscher and Keshav Pingali. 2011. GPU Computing Gems Emerald Edition .Morgan Kaufmann, Chapter 6, An Efficient CUDA Implementation of the Tree-Based Barnes Hut n-Body Algorithm, 75--92."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-03850-6_7"},{"key":"e_1_3_2_1_7_1","volume-title":"2016 IEEE International Symposium on High Performance Computer Architecture (HPCA). 274--284","author":"Chen S.","unstructured":"S. Chen and L. Peng. 2016. Efficient GPU hardware transactional memory through early conflict resolution. In 2016 IEEE International Symposium on High Performance Computer Architecture (HPCA). 274--284."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080204"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.5555\/3195638.3195652"},{"key":"e_1_3_2_1_10_1","volume-title":"Warp Scheduling for Fine-Grained Synchronization. In 2018 IEEE International Symposium on High Performance Computer Architecture (HPCA). 375--388","author":"ElTantawy A.","unstructured":"A. ElTantawy and T. M. Aamodt. 2018. Warp Scheduling for Fine-Grained Synchronization. In 2018 IEEE International Symposium on High Performance Computer Architecture (HPCA). 375--388."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2145816.2145849"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540743"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155655"},{"key":"e_1_3_2_1_14_1","unstructured":"GroupLens Research. 2006. MovieLens Data Sets. http:\/\/www.grouplens.org\/node\/73."},{"key":"e_1_3_2_1_15_1","volume-title":"2010 IEEE International Symposium on Parallel Distributed Processing (IPDPS). 1--10","author":"He Z.","unstructured":"Z. He and B. Hong. 2010. Dynamically tuned push-relabel algorithm for the maximum flow problem on CPU-GPU-Hybrid platforms. In 2010 IEEE International Symposium on Parallel Distributed Processing (IPDPS). 1--10."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2508148.2485940"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/1810479.1810540"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","unstructured":"David S. Johnson and Catherine C. McGeoch (Eds.). 1993. Network Flows and Matching: First DIMACS Implementation Challenge .American Mathematical Society Boston MA USA.","DOI":"10.5555\/562474"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2716282.2716289"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.5555\/1005332.1005345"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751232"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750396"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"L. Liu M. Liu C. J. Wang and J. Wang. 2016. Compile-Time Automatic Synchronization Insertion and Redundant Synchronization Elimination for GPU Kernels. In 2016 IEEE 22nd International Conference on Parallel and Distributed Systems (ICPADS). 826--834.","DOI":"10.1109\/ICPADS.2016.0112"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/2342821.2342827"},{"key":"e_1_3_2_1_25_1","unstructured":"NVIDIA. 2016. GeForce GTX 1080 Whitepaper. https:\/\/international.download.nvidia.com\/geforce-com\/international\/pdfs\/GeForce_GTX_1080_Whitepaper_FINAL.pdf"},{"key":"e_1_3_2_1_26_1","unstructured":"NVIDIA. 2017. NVIDIA TESLA V100 GPU ARCHITECTURE. http:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/volta-architecture-whitepaper.pdf"},{"key":"e_1_3_2_1_27_1","unstructured":"NVIDIA. 2018. Nvidia Profiler User Guide. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html"},{"key":"e_1_3_2_1_28_1","unstructured":"NVIDIA. 2018. Tuning CUDA Applications for Pascal. https:\/\/docs.nvidia.com\/cuda\/pascal-tuning-guide\/index.html"},{"key":"e_1_3_2_1_29_1","unstructured":"Y. Oyama K. Taura and A. Yonezawa. {n. d.}. EXECUTING PARALLEL PROGRAMS WITH SYNCHRONIZATION BOTTLENECKS EFFICIENTLY."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/2983990.2984015"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/2858652"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2882903.2882958"},{"key":"e_1_3_2_1_33_1","volume-title":"2017 IEEE International Symposium on High Performance Computer Architecture (HPCA). 625--636","author":"Ren X.","unstructured":"X. Ren and M. Lis. 2017. Efficient Sequential Consistency in GPUs via Relativistic Cache Coherence. In 2017 IEEE International Symposium on High Performance Computer Architecture (HPCA). 625--636."},{"key":"e_1_3_2_1_34_1","volume-title":"2018 IEEE International Symposium on High Performance Computer Architecture (HPCA). 235--246","author":"Ren X.","unstructured":"X. Ren and M. Lis. 2018. High-Performance GPU Transactional Memory via Eager Conflict Detection. In 2018 IEEE International Symposium on High Performance Computer Architecture (HPCA). 235--246."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830821"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/2830772.2830778"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522351"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/2528521.1508274"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2010.12"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/1320302.1320834"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/2903150.2903155"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2581122.2544139"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2013.82"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/1250662.1250668"}],"event":{"name":"ASPLOS '19: Architectural Support for Programming Languages and Operating Systems","location":"Providence RI USA","acronym":"ASPLOS '19","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the Twenty-Fourth International Conference on Architectural Support for Programming Languages and Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3297858.3304055","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3297858.3304055","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3297858.3304055","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:53:15Z","timestamp":1750204395000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3297858.3304055"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,4,4]]},"references-count":44,"alternative-id":["10.1145\/3297858.3304055","10.1145\/3297858"],"URL":"https:\/\/doi.org\/10.1145\/3297858.3304055","relation":{},"subject":[],"published":{"date-parts":[[2019,4,4]]},"assertion":[{"value":"2019-04-04","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}