{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T02:36:30Z","timestamp":1764815790133,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":28,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,6,23]],"date-time":"2024-06-23T00:00:00Z","timestamp":1719100800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"National Science and Technology Major Project","award":["2022ZD0116800"],"award-info":[{"award-number":["2022ZD0116800"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,6,23]]},"DOI":"10.1145\/3649329.3658462","type":"proceedings-article","created":{"date-parts":[[2024,11,7]],"date-time":"2024-11-07T19:27:22Z","timestamp":1731007642000},"page":"1-6","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Control Flow Divergence Optimization by Exploiting Tensor Cores"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0208-4677","authenticated-orcid":false,"given":"Weiguang","family":"Pang","sequence":"first","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"},{"name":"Key Laboratory of Computing Power Network and Information Security, Ministry of Education, Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences), Jinan, Shandong, China"},{"name":"Shandong Provincial Key Laboratory of Computer Networks, Shandong Fundamental Research Center for Computer Science, Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2675-2895","authenticated-orcid":false,"given":"Xu","family":"Jiang","sequence":"additional","affiliation":[{"name":"University of Electronic Science and Technology of China, Chengdu, Sichuan, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9234-5799","authenticated-orcid":false,"given":"Songran","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science and Engineering, Northeastern Univeristy, Shenyang, Liaoning, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2637-9683","authenticated-orcid":false,"given":"Lei","family":"Qiao","sequence":"additional","affiliation":[{"name":"Beijing Institute of Control Engineering, Beijing, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1204-0942","authenticated-orcid":false,"given":"Kexue","family":"Fu","sequence":"additional","affiliation":[{"name":"Key Laboratory of Computing Power Network and Information Security, Ministry of Education, Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences), Jinan, Shandong, China"},{"name":"Shandong Provincial Key Laboratory of Computer Networks, Shandong Fundamental Research Center for Computer Science, Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3026-7537","authenticated-orcid":false,"given":"Longxiang","family":"Gao","sequence":"additional","affiliation":[{"name":"Key Laboratory of Computing Power Network and Information Security, Ministry of Education, Shandong Computer Science Center (National Supercomputer Center in Jinan), Qilu University of Technology (Shandong Academy of Sciences), Jinan, Shandong, China"},{"name":"Shandong Provincial Key Laboratory of Computer Networks, Shandong Fundamental Research Center for Computer Science, Jinan, Shandong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2994-6110","authenticated-orcid":false,"given":"Wang","family":"Yi","sequence":"additional","affiliation":[{"name":"Uppsala University, Uppsala, Uppsala, Sweden"},{"name":"School of Computer Science and Engineering, Northeastern Univeristy, Shenyang, Liaoning, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,11,7]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"crossref","unstructured":"Sara S Baghsorkhi and Matthieu Delahaye. 2010. An adaptive performance modeling tool for GPU architectures. In ACM SIGPLAN. 105--114.","DOI":"10.1145\/1693453.1693470"},{"key":"e_1_3_2_1_2_1","volume-title":"Rodinia: A benchmark suite for heterogeneous computing. In 2009 IEEE IISWC. Ieee, 44--54.","author":"Che Shuai","year":"2009","unstructured":"Shuai Che and Michael Boyer. 2009. Rodinia: A benchmark suite for heterogeneous computing. In 2009 IEEE IISWC. Ieee, 44--54."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.022071134"},{"volume-title":"An accurate GPU performance model for effective control flow divergence optimization. In 2012 IEEE PDPS","author":"Cui Zheng","key":"e_1_3_2_1_4_1","unstructured":"Zheng Cui. 2012. An accurate GPU performance model for effective control flow divergence optimization. In 2012 IEEE PDPS. IEEE, 83--94."},{"key":"e_1_3_2_1_5_1","first-page":"12","article-title":"Two-level branch prediction using neural networks","volume":"49","author":"Egan Colin","year":"2003","unstructured":"Colin Egan. 2003. Two-level branch prediction using neural networks. JSA 49, 12-15 (2003), 557--570.","journal-title":"JSA"},{"volume-title":"CUDA application design and development","author":"Farber Rob","key":"e_1_3_2_1_6_1","unstructured":"Rob Farber. 2011. CUDA application design and development. Elsevier."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1543753.1543756"},{"volume-title":"Thread block compaction for efficient SIMT control flow. In 2011 IEEE HPCA","author":"Fung Wilson WL","key":"e_1_3_2_1_9_1","unstructured":"Wilson WL Fung. 2011. Thread block compaction for efficient SIMT control flow. In 2011 IEEE HPCA. IEEE, 25--36."},{"volume-title":"Accel-Sim: An extensible simulation framework for validated GPU modeling","author":"Khairy Mahmoud","key":"e_1_3_2_1_10_1","unstructured":"Mahmoud Khairy. 2020. Accel-Sim: An extensible simulation framework for validated GPU modeling. In ACM\/IEEE ISCA. IEEE."},{"key":"e_1_3_2_1_11_1","first-page":"1165","article-title":"An accurate GPU performance model for effective control flow divergence optimization","volume":"35","author":"Liang Yun","year":"2015","unstructured":"Yun Liang. 2015. An accurate GPU performance model for effective control flow divergence optimization. IEEE TCAD 35, 7 (2015), 1165--1178.","journal-title":"IEEE TCAD"},{"key":"e_1_3_2_1_12_1","unstructured":"Huanxin Lin. [n. d.]. On-GPU thread-data remapping for nested branch divergence. J. Parallel and Distrib. Comput. ([n. d.])."},{"key":"e_1_3_2_1_13_1","first-page":"1","article-title":"On-GPU thread-data remapping for branch divergence reduction","volume":"15","author":"Lin Huanxin","year":"2018","unstructured":"Huanxin Lin. 2018. On-GPU thread-data remapping for branch divergence reduction. ACM TACO 15, 3 (2018), 1--24.","journal-title":"ACM TACO"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3017196"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815992"},{"key":"e_1_3_2_1_16_1","volume-title":"Accelerating sparse deep neural networks. arXiv preprint arXiv:2104.08378","author":"Mishra Asit","year":"2021","unstructured":"Asit Mishra. 2021. Accelerating sparse deep neural networks. arXiv preprint arXiv:2104.08378 (2021)."},{"key":"e_1_3_2_1_17_1","unstructured":"NVIDIA. 2022. Warp Matrix Multiply-Accumulate (WMMA). http:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html"},{"volume-title":"Dissecting the CUDA scheduling hierarchy: a performance and predictability perspective. In 2020 RTAS","author":"Olmedo Ignacio Sa\u00f1udo","key":"e_1_3_2_1_18_1","unstructured":"Ignacio Sa\u00f1udo Olmedo. 2020. Dissecting the CUDA scheduling hierarchy: a performance and predictability perspective. In 2020 RTAS. IEEE, 213--225."},{"key":"e_1_3_2_1_19_1","first-page":"102888","article-title":"Efficient CUDA stream management for multi-DNN real-time inference on embedded GPUs","volume":"139","author":"Pang Weiguang","year":"2023","unstructured":"Weiguang Pang. 2023. Efficient CUDA stream management for multi-DNN real-time inference on embedded GPUs. JSA 139 (2023), 102888.","journal-title":"JSA"},{"key":"e_1_3_2_1_20_1","first-page":"2849","article-title":"Towards the Predictability of Dynamic Real-Time DNN Inference","volume":"41","author":"Pang Weiguang","year":"2021","unstructured":"Weiguang Pang and Xu Jiang. 2021. Towards the Predictability of Dynamic Real-Time DNN Inference. IEEE TCAD 41, 9 (2021), 2849--2862.","journal-title":"IEEE TCAD"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-12239-2_46"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1089\/cmb.2009.0062"},{"key":"e_1_3_2_1_23_1","first-page":"27","article-title":"Parboil: A revised benchmark suite for scientific and commercial throughput computing","volume":"127","author":"Stratton John A","year":"2012","unstructured":"John A Stratton. 2012. Parboil: A revised benchmark suite for scientific and commercial throughput computing. Center for Reliable and High-Performance Computing 127 (2012), 27.","journal-title":"Center for Reliable and High-Performance Computing"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3217824"},{"key":"e_1_3_2_1_25_1","unstructured":"David Tarjan. [n. d.]. Merging path and gshare indexing in perceptron branch prediction. ACM TACO ([n. d.])."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/1810085.1810104"},{"volume-title":"Exploiting intra-sm parallelism in gpus via persistent and elastic blocks. In 2021 ICCD","author":"Zhao Han","key":"e_1_3_2_1_27_1","unstructured":"Han Zhao. 2021. Exploiting intra-sm parallelism in gpus via persistent and elastic blocks. In 2021 ICCD. IEEE, 290--298."},{"key":"e_1_3_2_1_28_1","volume-title":"Tacker: Tensor-CUDA Core Kernel Fusion for Improving the GPU Utilization while Ensuring QoS. In 2022","author":"Zhao Han","year":"2022","unstructured":"Han Zhao. 2022. Tacker: Tensor-CUDA Core Kernel Fusion for Improving the GPU Utilization while Ensuring QoS. In 2022 IEEE HPCA. IEEE, 800--813."}],"event":{"name":"DAC '24: 61st ACM\/IEEE Design Automation Conference","sponsor":["SIGDA ACM Special Interest Group on Design Automation","IEEE-CEDA","SIGBED ACM Special Interest Group on Embedded Systems"],"location":"San Francisco CA USA","acronym":"DAC '24"},"container-title":["Proceedings of the 61st ACM\/IEEE Design Automation Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3658462","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3649329.3658462","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T01:18:01Z","timestamp":1750295881000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3649329.3658462"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,6,23]]},"references-count":28,"alternative-id":["10.1145\/3649329.3658462","10.1145\/3649329"],"URL":"https:\/\/doi.org\/10.1145\/3649329.3658462","relation":{},"subject":[],"published":{"date-parts":[[2024,6,23]]},"assertion":[{"value":"2024-11-07","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}