{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T14:25:35Z","timestamp":1773843935225,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":33,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,8,29]],"date-time":"2022-08-29T00:00:00Z","timestamp":1661731200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["CCF-2225233;CCF-2144523"],"award-info":[{"award-number":["CCF-2225233;CCF-2144523"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,8,29]]},"DOI":"10.1145\/3545008.3545091","type":"proceedings-article","created":{"date-parts":[[2023,1,15]],"date-time":"2023-01-15T01:04:08Z","timestamp":1673744648000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":37,"title":["From RTL to CUDA: A GPU Acceleration Flow for RTL Simulation with Batch Stimulus"],"prefix":"10.1145","author":[{"given":"Dian-Lun","family":"Lin","sequence":"first","affiliation":[{"name":"University of Utah, United States of America"}]},{"given":"Haoxing","family":"Ren","sequence":"additional","affiliation":[{"name":"Nvidia Research, United States of America"}]},{"given":"Yanqing","family":"Zhang","sequence":"additional","affiliation":[{"name":"Nvidia Research, United States of America"}]},{"given":"Brucek","family":"Khailany","sequence":"additional","affiliation":[{"name":"Nvidia Research, United States of America"}]},{"given":"Tsung-Wei","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Utah, United States of America"}]}],"member":"320","published-online":{"date-parts":[[2023,1,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2012. Nvidia System Management Interface. https:\/\/developer.nvidia.com\/nvidia-system-management-interface."},{"key":"e_1_3_2_1_2_1","unstructured":"2012. Yosys. https:\/\/yosyshq.net\/yosys\/."},{"key":"e_1_3_2_1_3_1","unstructured":"2016. Spinal. https:\/\/github.com\/SpinalHDL\/VexRiscv."},{"key":"e_1_3_2_1_4_1","unstructured":"2017. Nvidia Deep Learning Accelerator Design (NVDLA). http:\/\/nvdla.org\/."},{"key":"e_1_3_2_1_5_1","unstructured":"2017. Nvidia Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems."},{"key":"e_1_3_2_1_6_1","unstructured":"2018. riscv-mini. https:\/\/github.com\/ucb-bar\/riscv-mini."},{"key":"e_1_3_2_1_7_1","unstructured":"2019. CUDA Graph. https:\/\/docs.nvidia.com\/cuda\/cuda-runtime-api\/group__CUDART__GRAPH.html."},{"key":"e_1_3_2_1_8_1","unstructured":"2022. RTLflow. https:\/\/github.com\/dian-lun-lin\/RTLflow."},{"key":"e_1_3_2_1_9_1","volume-title":"Efficiently exploiting low activity factors to accelerate RTL simulation","author":"Beamer Scott","unstructured":"Scott Beamer and David Donofrio. 2020. Efficiently exploiting low activity factors to accelerate RTL simulation. In ACM\/IEEE DAC. 1\u20136."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/1970353.1970363"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"crossref","unstructured":"Cheng-Hsiang Chiu and Tsung-Wei Huang. 2022. Composing Pipeline Parallelism Using Control Taskflow Graph. In ACM HPDC. 283\u2013\u2013284.","DOI":"10.1145\/3502181.3533714"},{"key":"e_1_3_2_1_12_1","volume-title":"Efficient Timing Propagation with Simultaneous Structural and Pipeline Parallelisms","author":"Chiu Cheng-Hsiang","unstructured":"Cheng-Hsiang Chiu and Tsung-Wei Huang. 2022. Efficient Timing Propagation with Simultaneous Structural and Pipeline Parallelisms. In ACM\/IEEE DAC."},{"key":"e_1_3_2_1_13_1","volume-title":"Markov chain Monte Carlo in practice","author":"Gilks R","unstructured":"Walter\u00a0R Gilks, Sylvia Richardson, and David Spiegelhalter. 1995. Markov chain Monte Carlo in practice. CRC press."},{"key":"e_1_3_2_1_14_1","volume-title":"GPU-accelerated Pash-based Timing Analysis","author":"Guo Guannan","unstructured":"Guannan Guo, Tsung-Wei Huang, Yibo Lin, and Martin Wong. 2021. GPU-accelerated Pash-based Timing Analysis. In ACM\/IEEE DAC."},{"key":"e_1_3_2_1_15_1","volume-title":"GPU-accelerated Static Timing Analysis","author":"Guo Zizheng","unstructured":"Zizheng Guo, Tsung-Wei Huang, and Yibo Lin. 2020. GPU-accelerated Static Timing Analysis. In IEEE\/ACM ICCAD. 1\u20138."},{"key":"e_1_3_2_1_16_1","volume-title":"Monte Carlo sampling methods using Markov chains and their applications","author":"Hastings W\u00a0Keith","unstructured":"W\u00a0Keith Hastings. 1970. Monte Carlo sampling methods using Markov chains and their applications. Oxford University Press."},{"key":"e_1_3_2_1_17_1","first-page":"776","article-title":"OpenTimer 2.0: A New Parallel Incremental Timing Analysis Engine","volume":"40","author":"Huang Tsung-Wei","year":"2021","unstructured":"Tsung-Wei Huang, Guannan Guo, Chun-Xun Lin, and Martin Wong. 2021. OpenTimer 2.0: A New Parallel Incremental Timing Analysis Engine. IEEE TCAD 40, 4 (2021), 776\u2013789.","journal-title":"IEEE TCAD"},{"key":"e_1_3_2_1_18_1","volume-title":"Cpp-Taskflow: Fast Task-based Parallel Programming using Modern C++","author":"Huang Tsung-Wei","unstructured":"Tsung-Wei Huang, Chun-Xun Lin, Guannan Guo, and Martin Wong. 2019. Cpp-Taskflow: Fast Task-based Parallel Programming using Modern C++. In IEEE IPDPS. 974\u2013983."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3104255"},{"key":"e_1_3_2_1_20_1","volume-title":"OpenTimer: A high-performance timing analysis tool","author":"Huang Tsung-Wei","unstructured":"Tsung-Wei Huang and Martin Wong. 2015. OpenTimer: A high-performance timing analysis tool. In IEEE\/ACM ICCAD. 895\u2013902."},{"key":"e_1_3_2_1_21_1","first-page":"1","article-title":"A Distributed Timing Analysis Framework for Large Designs","volume":"116","author":"Huang Tsung-Wei","year":"2016","unstructured":"Tsung-Wei Huang, Martin D.\u00a0F. Wong, Debjit Sinha, Kerim Kalafala, and Natesan Venkateswaran. 2016. A Distributed Timing Analysis Framework for Large Designs. In ACM\/IEEE DAC. 116:1\u2013116:6.","journal-title":"ACM\/IEEE DAC."},{"key":"e_1_3_2_1_22_1","volume-title":"\u00a0F. Wong","author":"Lin Chun-Xun","year":"2020","unstructured":"Chun-Xun Lin, Tsung-Wei Huang, and Martin D.\u00a0F. Wong. 2020. An Efficient Work-Stealing Scheduler for Task Dependency Graph. In IEEE ICPADS. 64\u201371."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Dian-Lun Lin and Tsung-Wei Huang. 2021. Efficient GPU Computation using Task Graph Parallelism. In Euro-Par. 435\u2013450.","DOI":"10.1007\/978-3-030-85665-6_27"},{"key":"e_1_3_2_1_24_1","first-page":"3041","article-title":"Accelerating Large Sparse Neural Network Inference Using GPU Task Graph Parallelism","volume":"33","author":"Lin Dian-Lun","year":"2022","unstructured":"Dian-Lun Lin and Tsung-Wei Huang. 2022. Accelerating Large Sparse Neural Network Inference Using GPU Task Graph Parallelism. IEEE TPDS 33, 11 (2022), 3041\u20133052.","journal-title":"IEEE TPDS"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","unstructured":"Lingyi Liu and Shobha Vasudevan. 2011. Efficient validation input generation in RTL by hybridized source code analysis. In 2011 Design Automation Test in Europe. 1\u20136. https:\/\/doi.org\/10.1109\/DATE.2011.5763253","DOI":"10.1109\/DATE.2011.5763253"},{"key":"e_1_3_2_1_26_1","volume-title":"Accelerating RTL simulation with GPUs","author":"Qian Hao","unstructured":"Hao Qian and Yangdong Deng. 2011. Accelerating RTL simulation with GPUs. In IEEE\/ACM ICCAD. 687\u2013693."},{"key":"e_1_3_2_1_27_1","volume-title":"Partitioning and scheduling parallel programs for execution on multiprocessors. Ph.\u00a0D. Dissertation","author":"Sarkar Vivek","unstructured":"Vivek Sarkar. 1987. Partitioning and scheduling parallel programs for execution on multiprocessors. Ph.\u00a0D. Dissertation. Stanford University."},{"key":"e_1_3_2_1_28_1","unstructured":"Wilson Snyder. 2018. Verilator 4.0: open simulation goes multithreaded. https:\/\/veripool.org\/papers\/Verilator_v4_Multithreaded_OrConf2018.pdf."},{"key":"e_1_3_2_1_29_1","unstructured":"Uri Tal. 2013. RocketSim: A GPU-based Simulation Accelerator for Chip Verification. https:\/\/on-demand-gtc.gputechconf.com\/gtcnew\/speakerName.php?speaker=Uri+Tal."},{"key":"e_1_3_2_1_30_1","volume-title":"Electronic Design Automation: Synthesis, Verification, and Test","author":"Wang Laung-Terng","unstructured":"Laung-Terng Wang, Yao-Wen Chang, and Kwang-Ting\u00a0(Tim) Cheng. 2009. Electronic Design Automation: Synthesis, Verification, and Test. Morgan Kaufmann Publishers Inc."},{"key":"e_1_3_2_1_31_1","volume-title":"Opportunities for RTL and Gate Level Simulation using GPUs","author":"Zhang Yanqing","unstructured":"Yanqing Zhang, Haoxing Ren, and Brucek Khailany. 2020. Opportunities for RTL and Gate Level Simulation using GPUs. In IEEE\/ACM ICCAD. 1\u20135."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530601"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/1970353.1970362"}],"event":{"name":"ICPP '22: 51st International Conference on Parallel Processing","location":"Bordeaux France","acronym":"ICPP '22"},"container-title":["Proceedings of the 51st International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3545008.3545091","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3545008.3545091","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3545008.3545091","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:02:44Z","timestamp":1750186964000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3545008.3545091"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,8,29]]},"references-count":33,"alternative-id":["10.1145\/3545008.3545091","10.1145\/3545008"],"URL":"https:\/\/doi.org\/10.1145\/3545008.3545091","relation":{},"subject":[],"published":{"date-parts":[[2022,8,29]]},"assertion":[{"value":"2023-01-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}