{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,1]],"date-time":"2025-10-01T15:47:08Z","timestamp":1759333628832,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":75,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,1,18]],"date-time":"2024-01-18T00:00:00Z","timestamp":1705536000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CCF-2349141,CCF-2349582,OAC-2349143,TI-2349144"],"award-info":[{"award-number":["CCF-2349141,CCF-2349582,OAC-2349143,TI-2349144"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,1,18]]},"DOI":"10.1145\/3635035.3635037","type":"proceedings-article","created":{"date-parts":[[2024,1,20]],"date-time":"2024-01-20T00:23:32Z","timestamp":1705710212000},"page":"95-106","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["An Efficient Task-Parallel Pipeline Programming Framework"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0406-885X","authenticated-orcid":false,"given":"Cheng-Hsiang","family":"Chiu","sequence":"first","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Wisconsin-Madison, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5641-9842","authenticated-orcid":false,"given":"Zhicheng","family":"Xiong","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, Tsinghua University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0724-5356","authenticated-orcid":false,"given":"Zizheng","family":"Guo","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9768-3378","authenticated-orcid":false,"given":"Tsung-Wei","family":"Huang","sequence":"additional","affiliation":[{"name":"Department of Electrical and Computer Engineering, University of Wisconsin-Madison, United States of America"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0977-2774","authenticated-orcid":false,"given":"Yibo","family":"Lin","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, China"}]}],"member":"320","published-online":{"date-parts":[[2024,1,19]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[1] Intel oneTBB. https:\/\/github.com\/oneapi-src\/oneTBB"},{"key":"e_1_3_2_1_2_1","unstructured":"[2] OpenMP. https:\/\/www.openmp.org\/"},{"key":"e_1_3_2_1_3_1","unstructured":"[3] TAU 2018 Contest. https:\/\/sites.google.com\/view\/taucontest2018\/home"},{"volume-title":"FastFlow: High-Level and Efficient Streaming on Multicore","author":"Aldinucci Marco","key":"e_1_3_2_1_4_1","unstructured":"Marco Aldinucci, Marco Danelutto, Peter Kilpatrick, and Massimo Torquati. 2017. FastFlow: High-Level and Efficient Streaming on Multicore. John Wiley and Sons, Ltd, Chapter\u00a013, 261\u2013280."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/1454115.1454128"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/209936.209958"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/ESPM251964.2020.00011"},{"volume-title":"uSAP: An Ultra-Fast Stochastic Graph Partitioner","author":"Chang Chih-Chun","key":"e_1_3_2_1_8_1","unstructured":"Chih-Chun Chang and Tsung-Wei Huang. 2023. uSAP: An Ultra-Fast Stochastic Graph Partitioner. In IEEE HPEC. 1\u20137."},{"key":"e_1_3_2_1_9_1","volume-title":"Composing Pipeline Parallelism Using Control Taskflow Graph. In ACM International Symposium on High-Performance Parallel and Distributed Computing (HPDC). 283\u2013284","author":"Chiu Cheng-Hsiang","year":"2022","unstructured":"Cheng-Hsiang Chiu and Tsung-Wei Huang. 2022. Composing Pipeline Parallelism Using Control Taskflow Graph. In ACM International Symposium on High-Performance Parallel and Distributed Computing (HPDC). 283\u2013284."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3489517.3530616"},{"key":"e_1_3_2_1_11_1","volume-title":"An Experimental Study of SYCL Task Graph Parallelism for Large-Scale Machine Learning Workloads. In European Conference on Parallel Processing (Euro-Par). 468\u2013479","author":"Chiu Cheng-Hsiang","year":"2022","unstructured":"Cheng-Hsiang Chiu, Dian-Lun Lin, and Tsung-Wei Huang. 2022. An Experimental Study of SYCL Task Graph Parallelism for Large-Scale Machine Learning Workloads. In European Conference on Parallel Processing (Euro-Par). 468\u2013479."},{"key":"e_1_3_2_1_12_1","volume-title":"Programming Dynamic Task Parallelism for Heterogeneous EDA Algorithms. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD).","author":"Chiu Cheng-Hsiang","year":"2023","unstructured":"Cheng-Hsiang Chiu, Dian-Lun Lin, and Tsung-Wei Huang. 2023. Programming Dynamic Task Parallelism for Heterogeneous EDA Algorithms. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"David del Rio\u00a0Astorga Manuel\u00a0F. Dolz Javier Fernandez and J.\u00a0Daniel Garcia. 2017. A Generic Parallel Pattern Interface for Stream and Data Processing. In Concurrency and Computation: Practice and Experience.","DOI":"10.1002\/cpe.4175"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2168836.2168873"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"crossref","unstructured":"Dalvan Griebler Marco Danelutto Massimo Torquati and Luiz\u00a0Gustavo Fernandes. 2017. SPar: A DSL for High-Level and Productive Stream Parallelism. In Parallel Processing Letters.","DOI":"10.1142\/S0129626417400059"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Dalvan Griebler Renato\u00a0B. Hoffmann Marco Danelutto and Luiz\u00a0G. Fernandes. 2019. High-Level and Productive Stream Parallelism for Dedup Ferret and Bzip2. In The Journal of Parallel Programming. 253\u2013271.","DOI":"10.1007\/s10766-018-0558-x"},{"key":"e_1_3_2_1_17_1","volume-title":"An Efficient Critical Path Generation Algorithm Considering Extensive Path Constraints. In ACM\/IEEE Degign Automation Conference (DAC). 1\u20136.","author":"Guo Guannan","year":"2020","unstructured":"Guannan Guo, Tsung-Wei Huang, Chun-Xun Lin, and Martin Wong. 2020. An Efficient Critical Path Generation Algorithm Considering Extensive Path Constraints. In ACM\/IEEE Degign Automation Conference (DAC). 1\u20136."},{"key":"e_1_3_2_1_18_1","volume-title":"GPU-accelerated Critical Path Generation with Path Constraints. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 1\u20139.","author":"Guo Guannan","year":"2021","unstructured":"Guannan Guo, Tsung-Wei Huang, Yibo Lin, and Martin Wong. 2021. GPU-accelerated Critical Path Generation with Path Constraints. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 1\u20139."},{"key":"e_1_3_2_1_19_1","volume-title":"GPU-Accelerated Static Timing Analysis. In IEEE\/ACM International Conference On Computer Aided Design (ICCAD).","author":"Guo Zizheng","year":"2020","unstructured":"Zizheng Guo, Tsung-Wei Huang, and Yibo Lin. 2020. GPU-Accelerated Static Timing Analysis. In IEEE\/ACM International Conference On Computer Aided Design (ICCAD)."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18074.2021.9586085"},{"key":"e_1_3_2_1_21_1","volume-title":"HeteroCPPR: Accelerating Common Path Pessimism Removal with Heterogeneous CPU-GPU Parallelism. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 1\u20139.","author":"Guo Zizheng","year":"2021","unstructured":"Zizheng Guo, Tsung-Wei Huang, and Yibo Lin. 2021. HeteroCPPR: Accelerating Common Path Pessimism Removal with Heterogeneous CPU-GPU Parallelism. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 1\u20139."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2023.3286261"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"crossref","unstructured":"Ralf Hoffmann Matthias Korch and Thomas Rauber. 2004. Performance Evaluation of Task Pools Based on Hardware Synchronization. In ACM Supercomputing. 44\u201344.","DOI":"10.1109\/SC.2004.38"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"crossref","unstructured":"Renato\u00a0B. Hoffmann Junior Loff Dalvan Griebler and Luiz\u00a0G. Fernandes. 2022. OpenMP as Runtime for Providing High-Level Stream Parallelism on Multi-Cores. In The Journal of Supercomputing. 7655\u20137676.","DOI":"10.1007\/s11227-021-04182-9"},{"key":"e_1_3_2_1_25_1","volume-title":"Enhancing the Performance Portability of Heterogeneous Circuit Analysis Programs. In IEEE High Performance Extreme Computing Conference (HPEC). 1\u20132.","author":"Huang Tsung-Wei","year":"2022","unstructured":"Tsung-Wei Huang. 2022. Enhancing the Performance Portability of Heterogeneous Circuit Analysis Programs. In IEEE High Performance Extreme Computing Conference (HPEC). 1\u20132."},{"volume-title":"OpenTimer v2: A New Parallel Incremental Timing Analysis Engine","author":"Huang Tsung-Wei","key":"e_1_3_2_1_26_1","unstructured":"Tsung-Wei Huang, Guannan Guo, Chun-Xun Lin, and Martin Wong. 2021. OpenTimer v2: A New Parallel Incremental Timing Analysis Engine. In IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems (TCAD). 776\u2013789."},{"key":"e_1_3_2_1_27_1","volume-title":"Task-Parallel Programming with Constrained Parallelism. In IEEE High Performance Extreme Computing Conference (HPEC). 1\u20137.","author":"Huang Tsung-Wei","year":"2022","unstructured":"Tsung-Wei Huang and Leslie Hwang. 2022. Task-Parallel Programming with Constrained Parallelism. In IEEE High Performance Extreme Computing Conference (HPEC). 1\u20137."},{"volume-title":"ACM International Conference on Multimedia (MM). 1360\u20131363","author":"Huang Tsung-Wei","key":"e_1_3_2_1_28_1","unstructured":"Tsung-Wei Huang, Chun-Xun Lin, Guannan Guo, and Martin D.\u00a0F. Wong. 2018. A General-Purpose Distributed Programming System Using Data-Parallel Streams. In ACM International Conference on Multimedia (MM). 1360\u20131363."},{"volume-title":"IEEE International Parallel and Distributed Processing Symposium (IPDPS), 974\u2013983","author":"Huang Tsung-Wei","key":"e_1_3_2_1_29_1","unstructured":"Tsung-Wei Huang, Chun-Xun Lin, Guannan Guo, and Martin D.\u00a0F. Wong. 2019. Cpp-Taskflow: Fast Task-based Parallel Programming using Modern C++. IEEE International Parallel and Distributed Processing Symposium (IPDPS), 974\u2013983."},{"key":"e_1_3_2_1_30_1","volume-title":"\u00a0F. Wong","author":"Huang Tsung-Wei","year":"2019","unstructured":"Tsung-Wei Huang, Chun-Xun Lin, Guannan Guo, and Martin D.\u00a0F. Wong. 2019. Essential Building Blocks for Creating an Open-Source EDA Project. In ACM\/IEEE DAC. Article 78, 4\u00a0pages."},{"volume-title":"IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 757\u2013765","author":"Huang Tsung-Wei","key":"e_1_3_2_1_31_1","unstructured":"Tsung-Wei Huang, Chun-Xun Lin, and Martin D.\u00a0F. Wong. 2017. DtCraft: A distributed execution engine for compute-intensive applications. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 757\u2013765."},{"key":"e_1_3_2_1_32_1","volume-title":"\u00a0F. Wong","author":"Huang Tsung-Wei","year":"2019","unstructured":"Tsung-Wei Huang, Chun-Xun Lin, and Martin D.\u00a0F. Wong. 2019. Distributed Timing Analysis at Scale. In ACM\/IEEE DAC. Article 229."},{"key":"e_1_3_2_1_33_1","volume-title":"Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System","author":"Huang Tsung-Wei","year":"2022","unstructured":"Tsung-Wei Huang, Dian-Lun Lin, Chun-Xun Lin, and Yibo Lin. 2022. Taskflow: A Lightweight Parallel and Heterogeneous Task Graph Computing System. In IEEE Transactions on Parallel and Distributed Systems (TPDS). 1303\u20131320."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2020.3025075"},{"volume-title":"Accelerated Path-Based Timing Analysis with MapReduce. In ACM International Symposium on Physical Design (ISPD). 103\u2013110","author":"Huang Tsung-Wei","key":"e_1_3_2_1_35_1","unstructured":"Tsung-Wei Huang and Martin D.\u00a0F. Wong. 2015. Accelerated Path-Based Timing Analysis with MapReduce. In ACM International Symposium on Physical Design (ISPD). 103\u2013110."},{"volume-title":"ACM\/IEEE International Workshop on System Level Interconnect Prediction (SLIP). 1\u20136.","author":"Huang Tsung-Wei","key":"e_1_3_2_1_36_1","unstructured":"Tsung-Wei Huang and Martin D.\u00a0F. Wong. 2015. On fast timing closure: speeding up incremental path-based timing analysis with mapreduce. In ACM\/IEEE International Workshop on System Level Interconnect Prediction (SLIP). 1\u20136."},{"volume-title":"OpenTimer: A High-Performance Timing Analysis Tool. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 895\u2013902","author":"Huang Tsung-Wei","key":"e_1_3_2_1_37_1","unstructured":"Tsung-Wei Huang and Martin D.\u00a0F. Wong. 2015. OpenTimer: A High-Performance Timing Analysis Tool. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 895\u2013902."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2016.2524566"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897937.2897959"},{"volume-title":"Fast Path-Based Timing Analysis for CPPR. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 596\u2013599","author":"Huang Tsung-Wei","key":"e_1_3_2_1_40_1","unstructured":"Tsung-Wei Huang, Pei-Ci Wu, and Martin D.\u00a0F. Wong. 2014. Fast Path-Based Timing Analysis for CPPR. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 596\u2013599."},{"volume-title":"UI-Timer: An Ultra-Fast Clock Network Pessimism Removal Algorithm. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 758\u2013765","author":"Huang Tsung-Wei","key":"e_1_3_2_1_41_1","unstructured":"Tsung-Wei Huang, Pei-Ci Wu, and Martin D.\u00a0F. Wong. 2014. UI-Timer: An Ultra-Fast Clock Network Pessimism Removal Algorithm. In IEEE\/ACM International Conference on Computer-Aided Design (ICCAD). 758\u2013765."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2018.2834422"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3605573.3605625"},{"key":"e_1_3_2_1_44_1","volume-title":"GLARE: Accelerating Sparse DNN Inference Kernels with Global Memory Access Reduction","author":"Jiang Shiu","year":"2023","unstructured":"Shiu Jiang, Tsung-Wei Huangand, and Tsung-Yi Ho. 2023. GLARE: Accelerating Sparse DNN Inference Kernels with Global Memory Access Reduction. In IEEE HPEC. 1\u20137."},{"key":"e_1_3_2_1_45_1","volume-title":"Reducing Time and Effort in IC Implementation: A Roadmap of Challenges and Solutions. In ACM Design Automation Conference (DAC).","author":"Kahng B.","year":"2018","unstructured":"Andrew\u00a0B. Kahng. 2018. Reducing Time and Effort in IC Implementation: A Roadmap of Challenges and Solutions. In ACM Design Automation Conference (DAC)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2676870.2676883"},{"key":"e_1_3_2_1_47_1","volume-title":"Load-Balanced Pipeline Parallelism. In International Conference on High Performance Computing, Networking, Storage and Analysis (SC). 1\u201312","author":"Kamruzzaman Md","year":"2013","unstructured":"Md Kamruzzaman, Steven Swanson, and Dean\u00a0M. Tullsen. 2013. Load-Balanced Pipeline Parallelism. In International Conference on High Performance Computing, Networking, Storage and Analysis (SC). 1\u201312."},{"volume-title":"A General Cache Framework for Efficient Generation of Timing Critical Paths","author":"Lai Kuan-Ming","key":"e_1_3_2_1_48_1","unstructured":"Kuan-Ming Lai, Tsung-Wei Huang, and Tsung-Yi Ho. 2019. A General Cache Framework for Efficient Generation of Timing Critical Paths. In ACM\/IEEE DAC. Article 108, 6\u00a0pages."},{"volume-title":"LibAbs: An Efficient and Accurate Timing Macro-Modeling Algorithm for Large Hierarchical Designs. In ACM\/IEEE Design Automation Conference (DAC).","author":"Lai Tin-Yin","key":"e_1_3_2_1_49_1","unstructured":"Tin-Yin Lai, Tsung-Wei Huang, and Martin D.\u00a0F. Wong. 2017. LibAbs: An Efficient and Accurate Timing Macro-Modeling Algorithm for Large Hierarchical Designs. In ACM\/IEEE Design Automation Conference (DAC)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"crossref","unstructured":"I-Ting\u00a0Angelina Lee Charles\u00a0E. Leiserson Tao\u00a0B. Schardl Zhunping Zhang and Jim Sukha. 2015. On-the-Fly Pipeline Parallelism. In ACM Transactions on Parallel Computing. 1\u201342.","DOI":"10.1145\/2809808"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"crossref","unstructured":"Daan Leijen Wolfram Schulte and Sebastian Burckhardt. 2009. The Design of a Task Parallel Library. In ACM OOPSLA. 227\u2013241.","DOI":"10.1145\/1640089.1640106"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Charles\u00a0E. Leiserson. 2010. The Cilk++ concurrency platform. In The Journal of Supercomputing. 244\u2013257.","DOI":"10.1007\/s11227-010-0405-3"},{"volume-title":"ACM International Conference on Multimedia (MM). 2284\u20132287","author":"Lin Chun-Xun","key":"e_1_3_2_1_53_1","unstructured":"Chun-Xun Lin, Tsung-Wei Huang, Guannan Guo, and Martin D.\u00a0F. Wong. 2019. A Modern C++ Parallel Task Programming Library. In ACM International Conference on Multimedia (MM). 2284\u20132287."},{"volume-title":"Proceedings of the 2018 on Great Lakes Symposium on VLSI (GLVLSI). 183\u2013188","author":"Lin Chun-Xun","key":"e_1_3_2_1_54_1","unstructured":"Chun-Xun Lin, Tsung-Wei Huang, Ting Yu, and Martin D.\u00a0F. Wong. 2018. A Distributed Power Grid Analysis Framework from Sequential Stream Graph. In Proceedings of the 2018 on Great Lakes Symposium on VLSI (GLVLSI). 183\u2013188."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC43674.2020.9286218"},{"key":"e_1_3_2_1_56_1","volume-title":"Efficient GPU Computation Using Task Graph Parallelism. In European Conference on Parallel Processing (Euro-Par).","author":"Lin Dian-Lun","year":"2021","unstructured":"Dian-Lun Lin and Tsung-Wei Huang. 2021. Efficient GPU Computation Using Task Graph Parallelism. In European Conference on Parallel Processing (Euro-Par)."},{"key":"e_1_3_2_1_57_1","first-page":"3041","article-title":"Accelerating Large Sparse Neural Network Inference Using GPU Task Graph Parallelism","volume":"33","author":"Lin Dian-Lun","year":"2022","unstructured":"Dian-Lun Lin and Tsung-Wei Huang. 2022. Accelerating Large Sparse Neural Network Inference Using GPU Task Graph Parallelism. IEEE IEEE Transactions on Parallel and Distributed Systems (TPDS) 33, 11 (2022), 3041\u20133052.","journal-title":"IEEE IEEE Transactions on Parallel and Distributed Systems (TPDS)"},{"key":"e_1_3_2_1_58_1","volume-title":"International Conference on Parallel Processing (ICPP). 1\u201312","author":"Lin Dian-Lun","year":"2023","unstructured":"Dian-Lun Lin, Haoxing Ren, Yanqing Zhang, Brucek Khailany, and Tsung-Wei Huang. 2023. From RTL to CUDA: A GPU Acceleration Flow for RTL Simulation with Batch Stimulus. In International Conference on Parallel Processing (ICPP). 1\u201312."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247942"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3475061.3475078"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178442.3178443"},{"volume-title":"Communication, Synchronization and Scheduling Algorithms for Efficient and Scalable Loop Pipelining","author":"Mastoras Aristeidis","key":"e_1_3_2_1_62_1","unstructured":"Aristeidis Mastoras and Thomas\u00a0R. Gross. 2018. Unifying Fixed Code Mapping, Communication, Synchronization and Scheduling Algorithms for Efficient and Scalable Loop Pipelining. In IEEE Transactions on Parallel and Distributed Systems (TPDS). 2136\u20132149."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"crossref","unstructured":"Aristeidis Mastoras and Thomas\u00a0R. Gross. 2019. Efficient and Scalable Execution of Fine-Grained Dynamic Linear Pipelines. In ACM Transactions on Architecture and Code Optimization (TACO). 1\u201326.","DOI":"10.1145\/3307411"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"crossref","unstructured":"Aristeidis Mastoras and Thomas\u00a0R. Gross. 2019. Load-balancing for load-imbalanced fine-grained linear pipelines. In Parallel Computing. 2136\u20132149.","DOI":"10.1016\/j.parco.2019.01.002"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582514.3582522"},{"key":"e_1_3_2_1_66_1","volume-title":"Load Balancing Using Wok-Stealing for Pipeline Parallelism in Emerging Applications. In ACM International Conference on Supercomputing (ICS). 517\u2013518","author":"Navarro Angeles","year":"2009","unstructured":"Angeles Navarro, Rafael Asenjo, Siham Tabik, and Calin Cascaval. 2009. Load Balancing Using Wok-Stealing for Pipeline Parallelism in Emerging Applications. In ACM International Conference on Supercomputing (ICS). 517\u2013518."},{"key":"e_1_3_2_1_67_1","volume-title":"Automatic Thread Extraction with Decoupled Software Pipelining. In IEEE\/ACM International Symposium on Microarchitecture (MICRO). 105\u2013118","author":"Ottoni Guilherme","year":"2005","unstructured":"Guilherme Ottoni, Ram Rangan, Adam Stoler, and David\u00a0I. August. 2005. Automatic Thread Extraction with Decoupled Software Pipelining. In IEEE\/ACM International Symposium on Microarchitecture (MICRO). 105\u2013118."},{"key":"e_1_3_2_1_68_1","volume-title":"Parallel-Stage Decoupled Software Pipelining. In IEEE\/ACM international symposium on Code generation and optimization (CGO). 114\u2013123","author":"Raman Easwaran","year":"2008","unstructured":"Easwaran Raman, Guilherme Ottoni, Arun Raman, Matthew\u00a0J. Bridges, and David\u00a0I. August. 2008. Parallel-Stage Decoupled Software Pipelining. In IEEE\/ACM international symposium on Code generation and optimization (CGO). 114\u2013123."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"crossref","unstructured":"Ram Rangan Neil Vachharajani Guilherme Ottoni and David\u00a0I. August. 2008. Performance Scalability of Decoupled Software Pipelining. In ACM Transactions on Architecture and Code Optimization (TACO). 1\u201325.","DOI":"10.1145\/1400112.1400113"},{"key":"e_1_3_2_1_70_1","volume-title":"Dynamic Fine-Grain Schedulig of Pipeline Parallelism. In International Conference on Parallel Architectures and Compilation Techniques (PACT). 22\u201332","author":"Sanchez Daniel","year":"2011","unstructured":"Daniel Sanchez, David Lo, Richard\u00a0M. Yoo, Jeremy Sugerman, and Christos Kozyrakis. 2011. Dynamic Fine-Grain Schedulig of Pipeline Parallelism. In International Conference on Parallel Architectures and Compilation Techniques (PACT). 22\u201332."},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577509"},{"key":"e_1_3_2_1_72_1","volume-title":"Feedback-Directed Pipeline Parallelism. In International Conference on Parallel Architectures and Compilation Techniques (PACT). 147\u2013156","author":"Suleman Aater","year":"2010","unstructured":"M.\u00a0Aater Suleman, Moinuddin\u00a0K. Qureshi, Khubaib, and Yale\u00a0N. Patt. 2010. Feedback-Directed Pipeline Parallelism. In International Conference on Parallel Architectures and Compilation Techniques (PACT). 147\u2013156."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"crossref","unstructured":"Adriano Vogel Dalvan Griebler and Luiz\u00a0Gustavo Fernandes. 2021. Providing high-level self-adaptive abstractions for stream parallelism on multicores. In Journal of Software: Practice and Experience. 1194\u20131217.","DOI":"10.1002\/spe.2948"},{"key":"e_1_3_2_1_74_1","volume-title":"Towards On-the-fly Self-Adaptation of Stream Parallel Patterns. In International Conference on Parallel, Distributed and Network-Based Processing (PDP). 89\u201393","author":"Vogel Adriano","year":"2021","unstructured":"Adriano Vogel, Gabriele Mencagli, Dalvan Griebler, Marco Danelutto, and Luiz\u00a0Gustavo Fernandes. 2021. Towards On-the-fly Self-Adaptation of Stream Parallel Patterns. In International Conference on Parallel, Distributed and Network-Based Processing (PDP). 89\u201393."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPEC49654.2021.9622872"}],"event":{"name":"HPCAsia 2024: International Conference on High Performance Computing in Asia-Pacific Region","acronym":"HPCAsia 2024","location":"Nagoya Japan"},"container-title":["Proceedings of the International Conference on High Performance Computing in Asia-Pacific Region"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3635035.3635037","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3635035.3635037","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T02:19:19Z","timestamp":1755915559000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3635035.3635037"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,1,18]]},"references-count":75,"alternative-id":["10.1145\/3635035.3635037","10.1145\/3635035"],"URL":"https:\/\/doi.org\/10.1145\/3635035.3635037","relation":{},"subject":[],"published":{"date-parts":[[2024,1,18]]},"assertion":[{"value":"2024-01-19","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}