{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:27:37Z","timestamp":1750220857960,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2020,2,22]],"date-time":"2020-02-22T00:00:00Z","timestamp":1582329600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2020,2,22]]},"DOI":"10.1145\/3368826.3377911","type":"proceedings-article","created":{"date-parts":[[2020,2,21]],"date-time":"2020-02-21T21:49:28Z","timestamp":1582321768000},"page":"121-132","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["Speculative reconvergence for improved SIMT efficiency"],"prefix":"10.1145","author":[{"given":"Sana","family":"Damani","sequence":"first","affiliation":[{"name":"Georgia Institute of Technology, USA"}]},{"given":"Daniel R.","family":"Johnson","sequence":"additional","affiliation":[{"name":"NVIDIA, USA"}]},{"given":"Mark","family":"Stephenson","sequence":"additional","affiliation":[{"name":"NVIDIA, USA"}]},{"given":"Stephen W.","family":"Keckler","sequence":"additional","affiliation":[{"name":"NVIDIA, USA"}]},{"given":"Eddie","family":"Yan","sequence":"additional","affiliation":[{"name":"University of Washington, USA"}]},{"given":"Michael","family":"McKeown","sequence":"additional","affiliation":[{"name":"Esperanto Technologies, USA"}]},{"given":"Olivier","family":"Giroux","sequence":"additional","affiliation":[{"name":"NVIDIA, USA"}]}],"member":"320","published-online":{"date-parts":[[2020,2,22]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/1572769.1572792"},{"key":"e_1_3_2_1_2_1","volume-title":"Tianyi David Han, Jonathan Rose, Stefan Andersson-Engels, and Lothar Lilge.","author":"Alerstam Erik","year":"2010","unstructured":"Erik Alerstam, William Chun Yip Lo, Tianyi David Han, Jonathan Rose, Stefan Andersson-Engels, and Lothar Lilge. 2010. Next-generation Acceleration and Code Optimization for Light Transport in Turbid Media Using GPUs. Biomedical Optics Express 1, 2 (2010)."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/NSSMIC.2009.5402382"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1016\/0304-405X(77)90005-8"},{"key":"e_1_3_2_1_5_1","volume-title":"MCNP: A General Monte Carlo N-Particle Transport Code.","author":"Briesmeister Judith F.","year":"2000","unstructured":"Judith F. Briesmeister. 2000. MCNP: A General Monte Carlo N-Particle Transport Code. (2000)."},{"key":"e_1_3_2_1_6_1","volume-title":"Volta: Performance and Programmability","author":"Choquette Jack","year":"2018","unstructured":"Jack Choquette, Olivier Giroux, and Denis Foley. 2018. Volta: Performance and Programmability. IEEE Micro 38, 2 (2018)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2155620.2155676"},{"key":"e_1_3_2_1_8_1","volume-title":"SIMT Microscheduling: Reducing Thread Stalling in Divergent Iterative Algorithms. In Euromicro International Conference on Parallel, Distributed and Networkbased Processing .","author":"Frey Steffen","year":"2012","unstructured":"Steffen Frey, Guido Reina, and Thomas Ertl. 2012. SIMT Microscheduling: Reducing Thread Stalling in Divergent Iterative Algorithms. In Euromicro International Conference on Parallel, Distributed and Networkbased Processing ."},{"volume-title":"Thread Block Compaction for Efficient SIMT Control Flow. In International Symposium on High Performance Computer Architecture (HPCA) .","author":"Wilson","key":"e_1_3_2_1_9_1","unstructured":"Wilson W.L. Fung and Tor M. Aamodt. 2011. Thread Block Compaction for Efficient SIMT Control Flow. In International Symposium on High Performance Computer Architecture (HPCA) ."},{"volume-title":"Dynamic Warp Formation and Scheduling for Efficient GPU Control Flow. In International Symposium on Microarchitecture (MICRO).","author":"Fung Wilson W.L.","key":"e_1_3_2_1_10_1","unstructured":"Wilson W.L. Fung, Ivan Sham, George Yuan, and T. M. Aamodt. 2007. Dynamic Warp Formation and Scheduling for Efficient GPU Control Flow. In International Symposium on Microarchitecture (MICRO)."},{"volume-title":"Workshop on General Purpose Processing on Graphics Processing Units (GPGPU) .","author":"Han Tianyi David","key":"e_1_3_2_1_11_1","unstructured":"Tianyi David Han and Tarek S. Abdelrahman. 2011. Reducing Branch divergence in GPU programs. In Workshop on General Purpose Processing on Graphics Processing Units (GPGPU) ."},{"volume-title":"Reducing Divergence in GPGPU Programs with Loop Merging. In Workshop on General Purpose Processor Using Graphics Processing Units (GPGPU) .","author":"Han Tianyi David","key":"e_1_3_2_1_12_1","unstructured":"Tianyi David Han and Tarek S. Abdelrahman. 2013. Reducing Divergence in GPGPU Programs with Loop Merging. In Workshop on General Purpose Processor Using Graphics Processing Units (GPGPU) ."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3204919.3204921"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"crossref","unstructured":"James T. Kajiya. 1986. The Rendering Equation. SIGGRAPH Comput. Graph. (1986).","DOI":"10.1145\/15922.15902"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2011.89"},{"key":"e_1_3_2_1_16_1","unstructured":"Lawrence Livermore National Labs. 2011. Monte Carlo Benchmark (MCB). https:\/\/codesign.llnl.gov\/mcb.php"},{"key":"e_1_3_2_1_17_1","volume-title":"NVIDIA Tesla: A Unified Graphics and Computing Architecture","author":"Lindholm Erik","year":"2008","unstructured":"Erik Lindholm, John Nickolls, Stuart Oberman, and John Montrym. 2008. NVIDIA Tesla: A Unified Graphics and Computing Architecture. IEEE Micro 28, 2 (2008)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"crossref","unstructured":"C-M. Ma J.S. Li T. Pawlicki S.B. Jiang J. Deng M.C. Lee T. Koumrian M. Luxton and S. Brain. 2002. A Monte Carlo Dose Calculation Tool for Radiotherapy Treatment Planning. Physics in Medicine and Biology 47 10 (2002).","DOI":"10.1088\/0031-9155\/47\/10\/305"},{"key":"e_1_3_2_1_19_1","volume-title":"Dynamic Warp Subdivision for Integrated Branch and Memory Divergence Tolerance. In International Symposium on Computer Architecture (ISCA).","author":"Meng Jiayuan","year":"2010","unstructured":"Jiayuan Meng, David Tarjan, and Kevin Skadron. 2010. Dynamic Warp Subdivision for Integrated Branch and Memory Divergence Tolerance. In International Symposium on Computer Architecture (ISCA)."},{"key":"e_1_3_2_1_20_1","unstructured":"NVIDIA. 2017. NVIDIA TESLA V100 GPU ARCHITECTURE. https:\/\/images.nvidia.com\/content\/volta-architecture\/pdf\/voltaarchitecture-whitepaper.pdf"},{"key":"e_1_3_2_1_21_1","unstructured":"NVIDIA. 2018. CUDA Binary Utilities Toolkit Documentation. https: \/\/docs.nvidia.com\/cuda\/cuda-binary-utilities\/index.html#volta"},{"key":"e_1_3_2_1_22_1","unstructured":"NVIDIA. 2019. Profiler User\u2019s Guide CUDA Toolkit Documentation. https:\/\/docs.nvidia.com\/cuda\/profiler-users-guide\/index.html"},{"key":"e_1_3_2_1_23_1","volume-title":"Austin Robison, and Martin Stich","author":"Parker Steven G.","year":"2010","unstructured":"Steven G. Parker, James Bigler, Andreas Dietrich, Heiko Friedrich, Jared Hoberock, David Luebke, David McAllister, Morgan McGuire, Keith Morley, Austin Robison, and Martin Stich. 2010. OptiX: A General Purpose Ray Tracing Engine. In ACM SIGGRAPH."},{"volume-title":"International Symposium on Computer Architecture (ISCA) .","author":"Rogers Timothy G.","key":"e_1_3_2_1_24_1","unstructured":"Timothy G. Rogers, Daniel R. Johnson, Mike O\u2019Connor, and Stephen W. Keckler. 2015. A Variable Warp Size Architecture. In International Symposium on Computer Architecture (ISCA) ."},{"key":"e_1_3_2_1_25_1","volume-title":"High-throughput Sequence Alignment Using Graphics Processing Units. BMC Bioinformatics 8","author":"Schatz Michael C.","year":"2007","unstructured":"Michael C. Schatz, Cole Trapnell, Arthur L Delcher, and Amitabh Varshney. 2007. High-throughput Sequence Alignment Using Graphics Processing Units. BMC Bioinformatics 8 (2007)."},{"key":"e_1_3_2_1_26_1","volume-title":"EASC 2014 - Solving Software Challenges for Exascale .","author":"Tramm John R.","year":"2014","unstructured":"John R. Tramm, Andrew R. Siegel, Benoit Forget, and Colin Josey. 2014. Performance Analysis of a Reduced Data Movement Algorithm for Neutron Cross Section Data in Monte Carlo Simulations. In EASC 2014 - Solving Software Challenges for Exascale ."},{"key":"e_1_3_2_1_27_1","unstructured":"John R. Tramm Andrew R. Siegel Tanzima Islam and Martin Shulz. 2014. XSBench \u2013 the Development and Verification of a Performance Abstraction for Monte Carlo Reactor Analysis. In The Role of Reactor Physics Toward a Sustainable Future (PHYSOR) ."},{"key":"e_1_3_2_1_28_1","volume-title":"Relaxing SIMD Control Flow Constraints Using Loop Transformations. In Conference on Programming Language Design and Implementation (PLDI) .","author":"von Hanxleden Reinhard","year":"1992","unstructured":"Reinhard von Hanxleden and Ken Kennedy. 1992. Relaxing SIMD Control Flow Constraints Using Loop Transformations. In Conference on Programming Language Design and Implementation (PLDI) ."},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Anti-Counterfeiting, Security, and Identification .","author":"Wu Hongwei","year":"2011","unstructured":"Hongwei Wu, Xiangnan Liu, and Weibin Tang. 2011. A Fast GPU-Based Implementation for MD5 Hash Reverse. In International Conference on Anti-Counterfeiting, Security, and Identification ."}],"event":{"name":"CGO '20: 18th ACM\/IEEE International Symposium on Code Generation and Optimization","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGMICRO ACM Special Interest Group on Microarchitectural Research and Processing","IEEE-CS Computer Society"],"location":"San Diego CA USA","acronym":"CGO '20"},"container-title":["Proceedings of the 18th ACM\/IEEE International Symposium on Code Generation and Optimization"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3368826.3377911","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3368826.3377911","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T23:23:28Z","timestamp":1750202608000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3368826.3377911"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,2,22]]},"references-count":29,"alternative-id":["10.1145\/3368826.3377911","10.1145\/3368826"],"URL":"https:\/\/doi.org\/10.1145\/3368826.3377911","relation":{},"subject":[],"published":{"date-parts":[[2020,2,22]]},"assertion":[{"value":"2020-02-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}