{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T04:14:52Z","timestamp":1750306492379,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":53,"publisher":"ACM","license":[{"start":{"date-parts":[[2015,12,5]],"date-time":"2015-12-05T00:00:00Z","timestamp":1449273600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CAREER - 1149773"],"award-info":[{"award-number":["CAREER - 1149773"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2015,12,5]]},"DOI":"10.1145\/2830772.2830778","type":"proceedings-article","created":{"date-parts":[[2016,1,11]],"date-time":"2016-01-11T13:38:13Z","timestamp":1452519493000},"page":"699-712","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Efficiently enforcing strong memory ordering in GPUs"],"prefix":"10.1145","author":[{"given":"Abhayendra","family":"Singh","sequence":"first","affiliation":[{"name":"University of Michigan, Ann Arbor"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shaizeen","family":"Aga","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Satish","family":"Narayanasamy","sequence":"additional","affiliation":[{"name":"University of Michigan, Ann Arbor"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2015,12,5]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"\"NVIDIA CUDA v7.0 Developer Guide \" http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html.  \"NVIDIA CUDA v7.0 Developer Guide \" http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html."},{"key":"e_1_3_2_1_2_1","first-page":"l1","article-title":"The OpenCL specification","volume":"1","author":"Munshi A.","year":"2009","unstructured":"A. Munshi , \" The OpenCL specification ,\" Khronos OpenCL Working Group , vol. 1 , pp. l1 -- 15 , 2009 . A. Munshi et al., \"The OpenCL specification,\" Khronos OpenCL Working Group, vol. 1, pp. l1--15, 2009.","journal-title":"Khronos OpenCL Working Group"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/2694344.2694391"},{"key":"e_1_3_2_1_4_1","first-page":"0","article-title":"The OpenCL specification","volume":"2","author":"Munshi A.","year":"2013","unstructured":"A. Munshi , \" The OpenCL specification ,\" Khronos OpenCL Working Group , vol. 2 . 0 , 2013 . A. Munshi et al., \"The OpenCL specification,\" Khronos OpenCL Working Group, vol. 2.0, 2013.","journal-title":"Khronos OpenCL Working Group"},{"key":"e_1_3_2_1_5_1","unstructured":"ISO\/IEC 9899:2011 \"Programming language C \" http:\/\/www.iso.org\/iso\/iso_catalogue\/catalogue_tc\/catalogue_detail.htm?csnumber=57853 2011.  ISO\/IEC 9899:2011 \"Programming language C \" http:\/\/www.iso.org\/iso\/iso_catalogue\/catalogue_tc\/catalogue_detail.htm?csnumber=57853 2011."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/325164.325100"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541981"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485940"},{"key":"e_1_3_2_1_9_1","first-page":"355","article-title":"Two Techniques to Enhance the Performance of Memory Consistency Models","author":"Gharachorloo K.","year":"1991","unstructured":"K. Gharachorloo , A. Gupta , and J. Hennessy , \" Two Techniques to Enhance the Performance of Memory Consistency Models ,\" in ICPP , 1991 , pp. 355 -- 364 . K. Gharachorloo, A. Gupta, and J. Hennessy, \"Two Techniques to Enhance the Performance of Memory Consistency Models,\" in ICPP, 1991, pp. 355--364.","journal-title":"ICPP"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/42190.42277"},{"key":"e_1_3_2_1_11_1","first-page":"524","volume-title":"End-to-end Sequential Consistency,\" in ISCA, june","author":"Singh A.","year":"2012","unstructured":"A. Singh , S. Narayanasamy , D. Marino , T. Millstein , and M. Musuvathi , \" End-to-end Sequential Consistency,\" in ISCA, june 2012 , pp. 524 -- 535 . A. Singh, S. Narayanasamy, D. Marino, T. Millstein, and M. Musuvathi, \"End-to-end Sequential Consistency,\" in ISCA, june 2012, pp. 524--535."},{"key":"e_1_3_2_1_12_1","first-page":"568","article-title":"Supporting x86-64 address translation for 100s of gpu lanes","author":"Power J.","year":"2014","unstructured":"J. Power , M. Hill , and D. Wood , \" Supporting x86-64 address translation for 100s of gpu lanes ,\" in HPCA , 2014 , pp. 568 -- 578 . J. Power, M. Hill, and D. Wood, \"Supporting x86-64 address translation for 100s of gpu lanes,\" in HPCA, 2014, pp. 568--578.","journal-title":"HPCA"},{"key":"e_1_3_2_1_13_1","unstructured":"\"Online companion material.\" {Online}. Available: http:\/\/www.eecs.umich.edu\/~ansingh\/micro15  \"Online companion material.\" {Online}. Available: http:\/\/www.eecs.umich.edu\/~ansingh\/micro15"},{"key":"e_1_3_2_1_14_1","volume-title":"White Paper,\" http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf","author":"CUDA","year":"2009","unstructured":"NVIDIA, \"NVIDIA's next generation CUDA compute architecture : Fermi , White Paper,\" http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf , 2009 . NVIDIA, \"NVIDIA's next generation CUDA compute architecture: Fermi, White Paper,\" http:\/\/www.nvidia.com\/content\/pdf\/fermi_white_papers\/nvidia_fermi_compute_architecture_whitepaper.pdf, 2009."},{"key":"e_1_3_2_1_15_1","unstructured":"AMD \"AMD Graphics Core Next \" http:\/\/www.amd.com\/Documents\/GCN_Architecture_whitepaper.pdf 2011.  AMD \"AMD Graphics Core Next \" http:\/\/www.amd.com\/Documents\/GCN_Architecture_whitepaper.pdf 2011."},{"key":"e_1_3_2_1_16_1","volume-title":"Now and in the Future","author":"Steele S.","year":"2011","unstructured":"S. Steele , \" ARM GP Us : Now and in the Future ,\" 2011 . {Online}. Available: http:\/\/www.arm.com\/files\/event\/8_steve_steele_arm_gpus_now_and_in_the_future.pdf S. Steele, \"ARM GPUs: Now and in the Future,\" 2011. {Online}. Available: http:\/\/www.arm.com\/files\/event\/8_steve_steele_arm_gpus_now_and_in_the_future.pdf"},{"key":"e_1_3_2_1_17_1","first-page":"434","article-title":"Memory access buffering in multiprocessors","author":"Dubois M.","year":"1986","unstructured":"M. Dubois , C. Scheurich , and F. A. Briggs , \" Memory access buffering in multiprocessors ,\" in ISCA , 1986 , pp. 434 -- 442 . M. Dubois, C. Scheurich, and F. A. Briggs, \"Memory access buffering in multiprocessors,\" in ISCA, 1986, pp. 434--442.","journal-title":"ISCA"},{"volume-title":"Kepler gk110,\" whitepaper","year":"2012","key":"e_1_3_2_1_18_1","unstructured":"NVIDIA, \"Nvidia's next generation cuda compute architecture : Kepler gk110,\" whitepaper , 2012 . {Online}. Available: www.nvidia.com\/content\/PDF\/kepler\/NVIDIA-Kepler-GK110-Architecture-Whitepaper.pdf NVIDIA, \"Nvidia's next generation cuda compute architecture: Kepler gk110,\" whitepaper, 2012. {Online}. Available: www.nvidia.com\/content\/PDF\/kepler\/NVIDIA-Kepler-GK110-Architecture-Whitepaper.pdf"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/2.707614"},{"key":"e_1_3_2_1_20_1","volume-title":"Scheduler in multi-threaded processor prioritizing instructions passing qualification rule","author":"Mills P.","year":"2011","unstructured":"P. Mills , J. Lindholm , B. Coon , G. Tarolli , and J. Burgess , \" Scheduler in multi-threaded processor prioritizing instructions passing qualification rule ,\" May 24 2011 , uS Patent 7,949,855. {Online}. Available: http:\/\/www.google.com\/patents\/US7949855 P. Mills, J. Lindholm, B. Coon, G. Tarolli, and J. Burgess, \"Scheduler in multi-threaded processor prioritizing instructions passing qualification rule,\" May 24 2011, uS Patent 7,949,855. {Online}. Available: http:\/\/www.google.com\/patents\/US7949855"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.16"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2540708.2540718"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2451116.2451158"},{"key":"e_1_3_2_1_24_1","first-page":"272","article-title":"MRPB: Memory request prioritization for massively parallel processors","author":"Jia W.","year":"2014","unstructured":"W. Jia , K. Shaw , and M. Martonosi , \" MRPB: Memory request prioritization for massively parallel processors ,\" in HPCA , 2014 , pp. 272 -- 283 . W. Jia, K. Shaw, and M. Martonosi, \"MRPB: Memory request prioritization for massively parallel processors,\" in HPCA, 2014, pp. 272--283.","journal-title":"HPCA"},{"key":"e_1_3_2_1_25_1","first-page":"174","article-title":"Mascar: Speeding up gpu warps by reducing memory pitstops","author":"Sethia A.","year":"2015","unstructured":"A. Sethia , D. A. Jamshidi , and S. Mahlke , \" Mascar: Speeding up gpu warps by reducing memory pitstops ,\" in HPCA , 2015 , pp. 174 -- 185 . A. Sethia, D. A. Jamshidi, and S. Mahlke, \"Mascar: Speeding up gpu warps by reducing memory pitstops,\" in HPCA, 2015, pp. 174--185.","journal-title":"HPCA"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/237090.237142"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555779"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000076"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541942"},{"key":"e_1_3_2_1_30_1","unstructured":"R. Danilak \"System and method for hardware-based gpu paging to system memory \" Nov. 24 2009 uS Patent 7 623 134. {Online}. Available: http:\/\/www.google.com\/patents\/US7623134  R. Danilak \"System and method for hardware-based gpu paging to system memory \" Nov. 24 2009 uS Patent 7 623 134. {Online}. Available: http:\/\/www.google.com\/patents\/US7623134"},{"key":"e_1_3_2_1_31_1","volume-title":"Dedicated mechanism for page mapping in a gpu","author":"Tong P.","year":"2008","unstructured":"P. Tong , S. Yeoh , K. Kranzusch , G. Lorensen , K. Woo , A. Kaul , C. Case , S. Gottschalk , and D. Ma , \" Dedicated mechanism for page mapping in a gpu ,\" Jan. 31 2008 , uS Patent App . 11\/689,485. {Online}. Available: http:\/\/www.google.com\/patents\/US20080028181 P. Tong, S. Yeoh, K. Kranzusch, G. Lorensen, K. Woo, A. Kaul, C. Case, S. Gottschalk, and D. Ma, \"Dedicated mechanism for page mapping in a gpu,\" Jan. 31 2008, uS Patent App. 11\/689,485. {Online}. Available: http:\/\/www.google.com\/patents\/US20080028181"},{"key":"e_1_3_2_1_32_1","unstructured":"Intel \"Intel Virtualization Technology for Directed I\/O Architecture Specification \" 2006.  Intel \"Intel Virtualization Technology for Directed I\/O Architecture Specification \" 2006."},{"key":"e_1_3_2_1_33_1","unstructured":"AMD \"AMD I\/O Virtualization Technology (IOMMU) Specification \" 2006.  AMD \"AMD I\/O Virtualization Technology (IOMMU) Specification \" 2006."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-24322-6_22"},{"key":"e_1_3_2_1_35_1","first-page":"163","article-title":"Analyzing cuda workloads using a detailed gpu simulator","author":"Bakhoda A.","year":"2009","unstructured":"A. Bakhoda , G. L. Yuan , W. W. Fung , H. Wong , and T. M. Aamodt , \" Analyzing cuda workloads using a detailed gpu simulator ,\" in ISPASS , 2009 , pp. 163 -- 174 . A. Bakhoda, G. L. Yuan, W. W. Fung, H. Wong, and T. M. Aamodt, \"Analyzing cuda workloads using a detailed gpu simulator,\" in ISPASS, 2009, pp. 163--174.","journal-title":"ISPASS"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/1105734.1105747"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/2024716.2024718"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522351"},{"key":"e_1_3_2_1_39_1","first-page":"33","article-title":"Garnet: A detailed on-chip network model inside a full-system simulator","author":"Agarwal N.","year":"2009","unstructured":"N. Agarwal , T. Krishna , L.-S. Peh , and N. K. Jha , \" Garnet: A detailed on-chip network model inside a full-system simulator ,\" in ISPASS , 2009 , pp. 33 -- 42 . N. Agarwal, T. Krishna, L.-S. Peh, and N. K. Jha, \"Garnet: A detailed on-chip network model inside a full-system simulator,\" in ISPASS, 2009, pp. 33--42.","journal-title":"ISPASS"},{"key":"e_1_3_2_1_40_1","first-page":"235","article-title":"Demystifying gpu microarchitecture through microbenchmarking","author":"Wong H.","year":"2010","unstructured":"H. Wong , M.-M. Papadopoulou , M. Sadooghi-Alvandi , and A. Moshovos , \" Demystifying gpu microarchitecture through microbenchmarking ,\" in ISPASS , 2010 , pp. 235 -- 246 . H. Wong, M.-M. Papadopoulou, M. Sadooghi-Alvandi, and A. Moshovos, \"Demystifying gpu microarchitecture through microbenchmarking,\" in ISPASS, 2010, pp. 235--246.","journal-title":"ISPASS"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"e_1_3_2_1_42_1","first-page":"1","volume-title":"May 2012","author":"Grauer-Gray S.","unstructured":"S. Grauer-Gray , L. Xu , R. Searles , S. Ayalasomayajula , and J. Cavazos , \" Auto-tuning a high-level language targeted to gpu codes,\" in Innovative Parallel Computing (InPar), 2012 , May 2012 , pp. 1 -- 10 . S. Grauer-Gray, L. Xu, R. Searles, S. Ayalasomayajula, and J. Cavazos, \"Auto-tuning a high-level language targeted to gpu codes,\" in Innovative Parallel Computing (InPar), 2012, May 2012, pp. 1--10."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2012.6189209"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/258492.258512"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/300979.300993"},{"key":"e_1_3_2_1_46_1","volume-title":"Speculative Sequential Consistency with Little Custom Storage,\" in PACT","author":"Gniady C.","year":"2002","unstructured":"C. Gniady and B. Falsafi , \" Speculative Sequential Consistency with Little Custom Storage,\" in PACT , 2002 . C. Gniady and B. Falsafi, \"Speculative Sequential Consistency with Little Custom Storage,\" in PACT, 2002."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555785"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/1250662.1250696"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/1250662.1250697"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/1024393.1024395"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2150976.2151006"},{"key":"e_1_3_2_1_52_1","first-page":"404","article-title":"Atomic sc for simple in-order processors","author":"Gope D.","year":"2014","unstructured":"D. Gope and M. Lipasti , \" Atomic sc for simple in-order processors ,\" in HPCA , 2014 , pp. 404 -- 415 . D. Gope and M. Lipasti, \"Atomic sc for simple in-order processors,\" in HPCA, 2014, pp. 404--415.","journal-title":"HPCA"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751211"}],"event":{"name":"MICRO-48: The 48th Annual IEEE\/ACM International Symposium of Microarchitecture","sponsor":["IEEE Computer Society TC-uARCH","SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing"],"location":"Waikiki Hawaii","acronym":"MICRO-48"},"container-title":["Proceedings of the 48th International Symposium on Microarchitecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2830772.2830778","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/2830772.2830778","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T05:48:39Z","timestamp":1750225719000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/2830772.2830778"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,12,5]]},"references-count":53,"alternative-id":["10.1145\/2830772.2830778","10.1145\/2830772"],"URL":"https:\/\/doi.org\/10.1145\/2830772.2830778","relation":{},"subject":[],"published":{"date-parts":[[2015,12,5]]},"assertion":[{"value":"2015-12-05","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}