{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T01:08:22Z","timestamp":1773277702062,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":62,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,3,25]],"date-time":"2023-03-25T00:00:00Z","timestamp":1679702400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000185","name":"Defense Advanced Research Projects Agency","doi-asserted-by":"publisher","award":["FA8650-18-2-7856"],"award-info":[{"award-number":["FA8650-18-2-7856"]}],"id":[{"id":"10.13039\/100000185","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100006602","name":"Air Force Research Laboratory","doi-asserted-by":"publisher","award":["FA8650-18-2-7856"],"award-info":[{"award-number":["FA8650-18-2-7856"]}],"id":[{"id":"10.13039\/100006602","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,3,25]]},"DOI":"10.1145\/3582016.3582020","type":"proceedings-article","created":{"date-parts":[[2023,3,20]],"date-time":"2023-03-20T16:59:03Z","timestamp":1679331543000},"page":"46-58","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Beyond Static Parallel Loops: Supporting Dynamic Task Parallelism on Manycore Architectures with Software-Managed Scratchpad Memories"],"prefix":"10.1145","author":[{"given":"Lin","family":"Cheng","sequence":"first","affiliation":[{"name":"Cornell University, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Max","family":"Ruttenberg","sequence":"additional","affiliation":[{"name":"University of Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dai Cheol","family":"Jung","sequence":"additional","affiliation":[{"name":"University of Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dustin","family":"Richmond","sequence":"additional","affiliation":[{"name":"University of California at Santa Cruz, Santa Cruz, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Michael","family":"Taylor","sequence":"additional","affiliation":[{"name":"University of Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Mark","family":"Oskin","sequence":"additional","affiliation":[{"name":"University of Washington, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Christopher","family":"Batten","sequence":"additional","affiliation":[{"name":"Cornell University, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2023,3,25]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Celerity: An Open-Source RISC-V Tiered Accelerator Fabric. Symp. on High Performance Chips (Hot Chips), Aug.","author":"Ajayi Tutu","year":"2017","unstructured":"Tutu Ajayi , Khalid Al-Hawaj , Aporva Amarnath , Steve Dai , Scott Davidson , Paul Gao , Gai Liu , Atieh Lotfi , Julian Puscar , Anuj Rao , Austin Rovinski , Loai Salem , Ningxiao Sun , Christopher Torng , Luis Vega , Bandhav Veluri , Xiaoyang Wang , Shaolin Xie , Chun Zhao , Ritchie Zhao , Christopher Batten , Ronald G. Dreslinski , Ian Galton , Rajesh K. Gupta , Patrick P. Mercier , Mani Srivastava , Michael B. Taylor , and Zhiru Zhang . 2017 . Celerity: An Open-Source RISC-V Tiered Accelerator Fabric. Symp. on High Performance Chips (Hot Chips), Aug. Tutu Ajayi, Khalid Al-Hawaj, Aporva Amarnath, Steve Dai, Scott Davidson, Paul Gao, Gai Liu, Atieh Lotfi, Julian Puscar, Anuj Rao, Austin Rovinski, Loai Salem, Ningxiao Sun, Christopher Torng, Luis Vega, Bandhav Veluri, Xiaoyang Wang, Shaolin Xie, Chun Zhao, Ritchie Zhao, Christopher Batten, Ronald G. Dreslinski, Ian Galton, Rajesh K. Gupta, Patrick P. Mercier, Mani Srivastava, Michael B. Taylor, and Zhiru Zhang. 2017. Celerity: An Open-Source RISC-V Tiered Accelerator Fabric. Symp. on High Performance Chips (Hot Chips), Aug."},{"key":"e_1_3_2_1_2_1","volume-title":"Workshop on Computer Architecture Research with RISC-V (CARRV), Oct.","author":"Ajayi Tutu","year":"2017","unstructured":"Tutu Ajayi , Khalid Al-Hawaj , Aporva Amarnath , Steve Dai , Scott Davidson , Paul Gao , Gai Liu , Anuj Rao , Austin Rovinski , Ningxiao Sun , Christopher Torng , Luis Vega , Bandhav Veluri , Shaolin Xie , Chun Zhao , Ritchie Zhao , Christopher Batten , Ronald G. Dreslinski , Rajesh K. Gupta , Michael B. Taylor , and Zhiru Zhang . 2017 . Experiences Using the RISC-V Ecosystem to Design an Accelerator-Centric SoC in TSMC 16nm . Workshop on Computer Architecture Research with RISC-V (CARRV), Oct. Tutu Ajayi, Khalid Al-Hawaj, Aporva Amarnath, Steve Dai, Scott Davidson, Paul Gao, Gai Liu, Anuj Rao, Austin Rovinski, Ningxiao Sun, Christopher Torng, Luis Vega, Bandhav Veluri, Shaolin Xie, Chun Zhao, Ritchie Zhao, Christopher Batten, Ronald G. Dreslinski, Rajesh K. Gupta, Michael B. Taylor, and Zhiru Zhang. 2017. Experiences Using the RISC-V Ecosystem to Design an Accelerator-Centric SoC in TSMC 16nm. Workshop on Computer Architecture Research with RISC-V (CARRV), Oct."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2015.26"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/509593.509632"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2008.105"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2008.4523070"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/237502.237574"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/209937.209958"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1006\/jpdc.1996.0107"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/324133.324234"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2638459"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00041"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/1103845.1094852"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00014"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2021.3103825"},{"key":"e_1_3_2_1_16_1","unstructured":"1993. CRAY T3D System Architecture Overview. http:\/\/www.bitsavers.org\/pdf\/cray\/HR-04033_CRAY_T3D_System_Architecture_Overview_Sep93.pdf \t\t\t\t  1993. CRAY T3D System Architecture Overview. http:\/\/www.bitsavers.org\/pdf\/cray\/HR-04033_CRAY_T3D_System_Architecture_Overview_Sep93.pdf"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2133806.2133822"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.022071133"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/1654059.1654113"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/277652.277725"},{"key":"e_1_3_2_1_21_1","volume-title":"ThunderX3\u2019s Cloudburst of Threads: Marvell Previews 96-core 384-thread Arm Server Processor. Microprocessor Report","author":"Halfhill Tom R.","unstructured":"Tom R. Halfhill . 2020. ThunderX3\u2019s Cloudburst of Threads: Marvell Previews 96-core 384-thread Arm Server Processor. Microprocessor Report , The Linley Group , Apr . Tom R. Halfhill. 2020. ThunderX3\u2019s Cloudburst of Threads: Marvell Previews 96-core 384-thread Arm Server Processor. Microprocessor Report, The Linley Group, Apr."},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2007.4378783"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2010.5434077"},{"key":"e_1_3_2_1_24_1","unstructured":"2012. Intel Cilk Plus Language Extension Specification. https:\/\/www.open-std.org\/jtc1\/sc22\/wg14\/www\/docs\/n1665.htm \t\t\t\t  2012. Intel Cilk Plus Language Extension Specification. https:\/\/www.open-std.org\/jtc1\/sc22\/wg14\/www\/docs\/n1665.htm"},{"key":"e_1_3_2_1_25_1","unstructured":"2019. Intel Threading Building Blocks. https:\/\/software.intel.com\/en-us\/intel-tbb \t\t\t\t  2019. Intel Threading Building Blocks. https:\/\/software.intel.com\/en-us\/intel-tbb"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/NOCS50636.2020.9241586"},{"key":"e_1_3_2_1_27_1","unstructured":"2022 (accessed Aug 2022). Kalray MPPA Products. Online Webpage.  https:\/\/www.kalrayinc.com\/products\/mppa-technology\/ \t\t\t\t  2022 (accessed Aug 2022). Kalray MPPA Products. Online Webpage.  https:\/\/www.kalrayinc.com\/products\/mppa-technology\/"},{"key":"e_1_3_2_1_28_1","unstructured":"David Kanter. 2015. Knights Landing Reshapes HPC. \t\t\t\t  David Kanter. 2015. Knights Landing Reshapes HPC."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555774"},{"key":"e_1_3_2_1_30_1","unstructured":"2011. OpenCL Specification v1.2. http:\/\/www.khronos.org\/registry\/cl\/specs\/opencl-1.2.pdf \t\t\t\t  2011. OpenCL Specification v1.2. http:\/\/www.khronos.org\/registry\/cl\/specs\/opencl-1.2.pdf"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/1629911.1630048"},{"key":"e_1_3_2_1_32_1","volume-title":"Int\u2019l Conf. on Cluster Computing, Sep, https:\/\/doi.org\/10","author":"Li L.","year":"1903","unstructured":"L. Li , J. Fang , H. Fu , J. Jiang , W. Zhao , C. He , X. You , and G. Yang . 2018. swCaffe: A Parallel Framework for Accelerating Deep Learning Applications on Sunway TaihuLight . Int\u2019l Conf. on Cluster Computing, Sep, https:\/\/doi.org\/10 .48550\/arXiv. 1903 .06934 10.48550\/arXiv.1903.06934 L. Li, J. Fang, H. Fu, J. Jiang, W. Zhao, C. He, X. You, and G. Yang. 2018. swCaffe: A Parallel Framework for Accelerating Deep Learning Applications on Sunway TaihuLight. Int\u2019l Conf. on Cluster Computing, Sep, https:\/\/doi.org\/10.48550\/arXiv.1903.06934"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2020.2973991"},{"key":"e_1_3_2_1_34_1","volume-title":"Scalable Hardware Support for Conditional Parallelization. Int\u2019l Conf. on Parallel Architectures and Compilation Techniques (PACT), Sep.","author":"Li Zheng","year":"2010","unstructured":"Zheng Li , Jose Duato , Olivier Certner , and Olivier Temam . 2010 . Scalable Hardware Support for Conditional Parallelization. Int\u2019l Conf. on Parallel Architectures and Compilation Techniques (PACT), Sep. Zheng Li, Jose Duato, Olivier Certner, and Olivier Temam. 2010. Scalable Hardware Support for Conditional Parallelization. Int\u2019l Conf. on Parallel Architectures and Compilation Techniques (PACT), Sep."},{"key":"e_1_3_2_1_35_1","volume-title":"Myong Hyon Cho, Ilia Lebedev, and Srinivas Devadas.","author":"Lis Mieszko","year":"2013","unstructured":"Mieszko Lis , Keun Sup Shim , Myong Hyon Cho, Ilia Lebedev, and Srinivas Devadas. 2013 . Hardware-Level Thread Migration in a 110-Core Shared-Memory Multiprocessor. MIT CSAIL CSG. Mieszko Lis, Keun Sup Shim, Myong Hyon Cho, Ilia Lebedev, and Srinivas Devadas. 2013. Hardware-Level Thread Migration in a 110-Core Shared-Memory Multiprocessor. MIT CSAIL CSG."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.3724\/SP.J.1016.2008.01975"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00028"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Michael McCool Arch D. Robinson and James Reinders. 2012. Structured Parallel Programming: Patterns for Efficient Computation. Morgan Kaufmann. \t\t\t\t  Michael McCool Arch D. Robinson and James Reinders. 2012. Structured Parallel Programming: Patterns for Efficient Computation. Morgan Kaufmann.","DOI":"10.1016\/B978-0-12-415993-8.00003-7"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.36"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-72521-3_18"},{"key":"e_1_3_2_1_41_1","volume-title":"Epiphany-V: A 1024-processor 64-bit RISC System-On-Chip. Computing Research Repository (CoRR), arXiv:abs\/1610.01832","author":"Olofsson Andreas","year":"2016","unstructured":"Andreas Olofsson . 2016. Epiphany-V: A 1024-processor 64-bit RISC System-On-Chip. Computing Research Repository (CoRR), arXiv:abs\/1610.01832 ( 2016 ), Aug, https:\/\/doi.org\/10.48550\/arXiv.1610.01832 10.48550\/arXiv.1610.01832 Andreas Olofsson. 2016. Epiphany-V: A 1024-processor 64-bit RISC System-On-Chip. Computing Research Repository (CoRR), arXiv:abs\/1610.01832 (2016), Aug, https:\/\/doi.org\/10.48550\/arXiv.1610.01832"},{"key":"e_1_3_2_1_42_1","unstructured":"2013. OpenMP Application Program Interface Version 4.0. http:\/\/www.openmp.org\/mp-documents\/OpenMP4.0.0.pdf \t\t\t\t  2013. OpenMP Application Program Interface Version 4.0. http:\/\/www.openmp.org\/mp-documents\/OpenMP4.0.0.pdf"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2014.6853209"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/NOCS50636.2020.9241710"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/SBAC-PAD.2007.36"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2011.7477491"},{"key":"e_1_3_2_1_47_1","unstructured":"James Reinders. 2007. Intel Threading Building Blocks: Outfitting C++ for Multi-core Processor Parallelism. O\u2019Reilly. \t\t\t\t  James Reinders. 2007. Intel Threading Building Blocks: Outfitting C++ for Multi-core Processor Parallelism. O\u2019Reilly."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.23919\/VLSIC.2019.8778031"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/LSSC.2019.2953847"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/2038037.1941582"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3155284.3018758"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/2517327.2442530"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2018.2814602"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/1345206.1345255"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2003.1234253"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC19947.2020.9062927"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/945445.945471"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00025"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2007.4378780"},{"key":"e_1_3_2_1_60_1","volume-title":"Ampere Maxes Out at 128 Cores. Microprocessor Report","author":"Wheeler Bob","unstructured":"Bob Wheeler . 2020. Ampere Maxes Out at 128 Cores. Microprocessor Report , The Linley Group , Jul . Bob Wheeler. 2020. Ampere Maxes Out at 128 Cores. Microprocessor Report, The Linley Group, Jul."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/2990509.2990510"},{"key":"e_1_3_2_1_62_1","volume-title":"Manticore: A 4096-Core RISC-V Chiplet Architecture for Ultraefficient Floating-Point Computing","author":"Zaruba Florian","year":"2021","unstructured":"Florian Zaruba , Fabian Schuiki , and Luca Benini . 2021 . Manticore: A 4096-Core RISC-V Chiplet Architecture for Ultraefficient Floating-Point Computing . IEEE Micro , Mar\/ Apr , https:\/\/doi.org\/10.48550\/arXiv.2008.06502 10.48550\/arXiv.2008.06502 Florian Zaruba, Fabian Schuiki, and Luca Benini. 2021. Manticore: A 4096-Core RISC-V Chiplet Architecture for Ultraefficient Floating-Point Computing. IEEE Micro, Mar\/Apr, https:\/\/doi.org\/10.48550\/arXiv.2008.06502"}],"event":{"name":"ASPLOS '23: 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3","location":"Vancouver BC Canada","acronym":"ASPLOS '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3582016.3582020","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:44Z","timestamp":1750178804000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3582016.3582020"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3,25]]},"references-count":62,"alternative-id":["10.1145\/3582016.3582020","10.1145\/3582016"],"URL":"https:\/\/doi.org\/10.1145\/3582016.3582020","relation":{},"subject":[],"published":{"date-parts":[[2023,3,25]]},"assertion":[{"value":"2023-03-25","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}