{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T16:44:11Z","timestamp":1776357851683,"version":"3.51.2"},"reference-count":302,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/access.2024.3372990","type":"journal-article","created":{"date-parts":[[2024,3,4]],"date-time":"2024-03-04T19:18:47Z","timestamp":1709579927000},"page":"34354-34377","source":"Crossref","is-referenced-by-count":8,"title":["MIMD Programs Execution Support on SIMD Machines: A Holistic Survey"],"prefix":"10.1109","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-1456-7377","authenticated-orcid":false,"given":"Dheya","family":"Mustafa","sequence":"first","affiliation":[{"name":"Department of Computer Engineering, Faculty of Engineering, The Hashemite University, Zarqa, Jordan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9535-2612","authenticated-orcid":false,"given":"Ruba","family":"Alkhasawneh","sequence":"additional","affiliation":[{"name":"Department of Communication and Computer Engineering, Faculty of Engineering, Al-Ahliyya Amman University, Amman, Jordan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8731-0989","authenticated-orcid":false,"given":"Fadi","family":"Obeidat","sequence":"additional","affiliation":[{"name":"Synopsys Inc., Austin, TX, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6239-3298","authenticated-orcid":false,"given":"Ahmed S.","family":"Shatnawi","sequence":"additional","affiliation":[{"name":"Department of Software Engineering, Jordan University of Science and Technology, Irbid, Jordan"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/234313.234345"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/359327.359336"},{"key":"ref3","volume-title":"3DNOW! Technology Manual","year":"2000"},{"key":"ref4","volume-title":"Neon Programmers\u2019 Guide","year":"2023"},{"key":"ref5","first-page":"998","article-title":"Motorola\u2019s AltiVe technology","volume":"6","author":"Fuller","year":"1998","journal-title":"White Paper"},{"key":"ref6","volume-title":"Intel\u00ae 64 and IA-32 Architectures Software Developer\u2019s Manual Volume 1: Basic Architecture","year":"2016"},{"key":"ref7","volume-title":"Intel\u00ae ledR 64 and IA-32 Architectures Software Developer Manuals Volume 2A: Instruction Set Reference","year":"2016"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.35"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2015.7477467"},{"key":"ref10","first-page":"1","article-title":"Introduction of Fujitsu\u2019s HPC processor for the post-K computer","volume-title":"Proc. Hot Chips 28th Symp.","author":"Yoshida"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1145\/566570.566640"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/1186562.1015800"},{"key":"ref13","volume-title":"Programming Massively Parallel Processors Lecture 12","author":"Shebanow","year":"2007"},{"key":"ref14","volume-title":"Method for conditional branch execution in SIMD vector processors","author":"Lorie","year":"1984"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2005.37"},{"key":"ref16","volume-title":"Method and system for programmable pipelined graphics processing with branching instructions","author":"Moy","year":"2005"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2007.59"},{"key":"ref18","volume-title":"NVIDIA CUDA (Compute Unified Device Architecture) Programming Guide 3.1","year":"2010"},{"key":"ref19","first-page":"47","article-title":"Exploiting recent SIMD architectural advances for irregular applications","volume-title":"Proc. IEEE\/ACM Int. Symp. Code Gener. Optim. (CGO)","author":"Chen"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1016\/j.eng.2020.01.007"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1016\/j.jpdc.2018.11.012"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1111\/j.1467-8659.2007.01012.x"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1007\/bfb0116397"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CMPCON.1990.63649"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/FMPC.1990.89460"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/2.121476"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/IPPS.1995.395959"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/63047.63048"},{"key":"ref29","volume-title":"Computer Architecture a Quantitative Approach","author":"Hennesy","year":"2017"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1145\/218864.218868"},{"key":"ref31","article-title":"Simulating applicative architectures on the connection machine","author":"Kuszmaul","year":"1986"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1145\/7902.7903"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/71.395399"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1093\/comjnl\/30.5.393"},{"key":"ref35","first-page":"1031","article-title":"Massively parallel implementation of flat GHC on the connection machine","volume-title":"Proc. Int. Conf. 5th Generat. Comput. Syst.","author":"Nilsson"},{"key":"ref36","article-title":"Environment and system interface for VM\/EPEX","author":"Darema-Rodgers","year":"1985"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1016\/0167-8191(88)90094-4"},{"key":"ref38","first-page":"65","article-title":"Characterizing parallel algorithms","volume-title":"The Characteristics of Parallel Algorithms","author":"Jamieson","year":"1987"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/76263.76342"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/71.80147"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1016\/0743-7315(91)90048-E"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1981.1675732"},{"key":"ref43","first-page":"387","article-title":"An overview of the PASM parallel processing system","volume-title":"Computer Architecture","author":"Siegel","year":"1987"},{"key":"ref44","volume-title":"Parallel Computing: Theory and Comparisons","author":"Lipovski","year":"1987"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1145\/1500518.1500623"},{"key":"ref46","first-page":"143","article-title":"The OPSILA computer","volume-title":"Parallel Languages and Architectures","author":"Auguin","year":"1986"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1016\/0165-6074(87)90034-2"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR.1988.28259"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2011.5764682"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339601"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2019.2950087"},{"key":"ref52","article-title":"Modern vector architectures for high-performance computing","author":"Poenaru","year":"2022"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4302-0172-4_8"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/FPL53798.2021.00082"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2017.2654506"},{"key":"ref56","volume-title":"Intel AVX-512 Instructions","author":"Reinders","year":"2017"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.3390\/computers11050075"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2007.346199"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2011.5764683"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1145\/3243176.3243192"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1145\/2370036.2145824"},{"key":"ref62","volume-title":"ATI Stream Computing SDK User Guide V1.3-Beta","year":"2008"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1145\/1837853.1693470"},{"key":"ref64","first-page":"1","article-title":"Vector engine processor of NEC\u2019s brand-new supercomputer SX-Aurora TSUBASA","volume-title":"Proc. Int. Symp. High Perform. Chips","author":"Yamada"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2018.00057"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-017-1993-y"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2018.032271057"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.7717\/peerj-cs.909"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1145\/3360307"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/HCS52781.2021.9567066"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2013.44"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/HCS49909.2020.9220591"},{"key":"ref74","volume-title":"BrookGPU Home Page","year":"2023"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI.2008.4541126"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.930"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1145\/1356058.1356084"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5160988"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/PDP.2015.44"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/1345206.1345220"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/1375527.1375562"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2020.02.069"},{"key":"ref83","first-page":"666","volume-title":"The OpenMP Application Programming Interface","year":"2018"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.4135\/9781483375519.n38"},{"key":"ref85","volume-title":"The OpenCL Specification Version 1.2. Khronos OpenCL Working Group","author":"Munshi","year":"2012"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC-SmartCity-DSS.2017.10"},{"key":"ref87","doi-asserted-by":"publisher","DOI":"10.1177\/1094342019832958"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/MCSoC.2015.10"},{"key":"ref89","volume-title":"HIP","year":"2023"},{"key":"ref90","volume-title":"Hip Programming Guide","year":"2022"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-71593-9_9"},{"key":"ref92","volume-title":"SYCLTM 2020 Specification (Revision 7)","year":"2020"},{"key":"ref93","volume-title":"Data Parallel C++ Mastering DPC++ for Programming of Heterogeneous Systems Using C++ and SYCL","author":"Ashbaugh","year":"2020"},{"key":"ref94","doi-asserted-by":"publisher","DOI":"10.1145\/3388333.3388649"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1201\/9781003393122-2"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.36"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-30961-8_13"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.1145\/1594835.1504194"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1504\/IJCSE.2013.052110"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3147846"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-662-48096-0_35"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1109\/ASAP.2013.6567598"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1145\/2600212.2600228"},{"key":"ref104","doi-asserted-by":"publisher","DOI":"10.1002\/9781119332015.ch13"},{"key":"ref105","volume-title":"The Python Programming Language","author":"Rossum","year":"1994"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2011.09.001"},{"key":"ref107","article-title":"CorePy: High-productivity Cell\/BE programming","volume-title":"Proc. 1st STI\/Georgia Tech Workshop Softw. Appl. Cell\/BE Processor","author":"Mueller"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-03869-3_82"},{"key":"ref109","doi-asserted-by":"publisher","DOI":"10.1145\/1513895.1513902"},{"key":"ref110","doi-asserted-by":"publisher","DOI":"10.1145\/79173.79181"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1145\/1399504.1360618"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/HPCC.2011.73"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1109\/LLVM-HPC.2014.10"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.5555\/977395.977673"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1145\/2632215"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-89740-8_1"},{"key":"ref117","volume-title":"The Portland Group, PGI Fortran and C Accelarator Programming Model","year":"2009"},{"key":"ref118","volume-title":"HMPP Workbench, a Directive-Based Compiler for Hybrid Computing","year":"2009"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735698"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1145\/1995896.1995932"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1145\/2400682.2400713"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/TII.2017.2731362"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2010.106"},{"key":"ref124","first-page":"1","article-title":"Par4All: From convex array regions to heterogeneous computing","volume-title":"Proc. 2nd Int. Workshop Polyhedral Compilation Techn., Impact","author":"Amini"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24595-9_4"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.8014"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2000.898067"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.1998.742118"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1145\/1048935.1050187"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-89740-8_2"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454106"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1145\/3458744.3473354"},{"key":"ref133","volume-title":"Execution Of MIMD MIPSEL Assembly Programs Within CUDA\/OpenCL GPUs","author":"Dietz","year":"2012"},{"key":"ref134","volume-title":"CuBLAS User Guide","year":"2016"},{"key":"ref135","volume-title":"CUDA Toolkit Documentation. NVIDIA Developer Zone","year":"2019"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1016\/j.softx.2019.100337"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-444-81784-6.50042-7"},{"key":"ref138","first-page":"320","article-title":"Optimizing the emulation of MIMD behavior on SIMD machines","volume":"96","author":"Sanders","year":"1996","journal-title":"Math. Res."},{"key":"ref139","article-title":"A massively parallel MIMD implemented by SIMD hardware","author":"Dietz","year":"1992"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-8191(84)90181-9"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1109\/CMPCON.1990.63648"},{"issue":"1","key":"ref142","first-page":"58","article-title":"MIMD execution by SIMD computers","volume":"13","author":"Nilsson","year":"1990","journal-title":"J. Inf. Process."},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/IPPS.1994.288285"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1145\/62678.62714"},{"key":"ref145","article-title":"Multiple instruction multiple data emulation on the connection machine","author":"Collins","year":"1991"},{"key":"ref146","article-title":"An exploration of asynchronous data-parallelism","author":"Littmari","year":"1988"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/IPPS.1992.222985"},{"key":"ref148","doi-asserted-by":"publisher","DOI":"10.1109\/FMPC.1992.234908"},{"key":"ref149","doi-asserted-by":"publisher","DOI":"10.1109\/PROC.1972.8647"},{"key":"ref150","volume-title":"ATI CTM Guide","year":"2006"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1145\/3173456"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1145\/3301488"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.1145\/964965.808581"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1145\/29873.29875"},{"key":"ref155","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-8191(05)80035-3"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1145\/1088149.1088172"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2005.33"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.1023\/A:1023090719310"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.1145\/1375527.1375566"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1145\/1088149.1088174"},{"key":"ref161","doi-asserted-by":"publisher","DOI":"10.1007\/s10766-008-0072-7"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-69303-1_12"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783714"},{"key":"ref164","article-title":"Towards scalar synchronization in SIMT architectures","author":"Ramamurthy","year":"2011"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2007.30"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.1145\/3302516.3307357"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.5821\/dissertation-2117-125844"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750393"},{"key":"ref169","doi-asserted-by":"publisher","DOI":"10.1145\/2555243.2555254"},{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1145\/1543753.1543756"},{"key":"ref171","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2014.6835936"},{"key":"ref172","article-title":"Toward compiler-driven adaptive execution and its application to GPU architectures","author":"Lee","year":"2011"},{"key":"ref173","article-title":"The landscape of parallel computing research: A view from Berkeley","author":"Asanovic","year":"2006"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2008.5222004"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-19861-8_13"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1145\/2464996.2467268"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1145\/1995896.1995900"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1145\/1950365.1950408"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/SHPCC.1992.232682"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/FMPC.1990.89495"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1145\/143095.143133"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.1016\/0743-7315(90)90088-7"},{"key":"ref183","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-12-384988-5.00006-1"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-540-77220-0_21"},{"key":"ref185","first-page":"52","article-title":"An effective GPU implementation of breadth-first search","volume-title":"Proc. 47th Design Autom. Conf.","author":"Luo"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1145\/2145816.2145832"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1145\/2442516.2442531"},{"key":"ref188","first-page":"29","article-title":"Task management for irregular-parallel workloads on the GPU","volume-title":"Proc. Conf. High Perform. Graph.","author":"Tzeng"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.1145\/2145816.2145831"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2009.5161039"},{"key":"ref191","doi-asserted-by":"publisher","DOI":"10.1145\/996841.996853"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1145\/1133981.1133997"},{"key":"ref193","first-page":"01","article-title":"Temporal vectorization for stencils","volume-title":"Proc. Int. Conf. High Perform. Comput., Netw., Storage Anal.","author":"Yuan"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462187"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1145\/3533737.3535089"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2014.11"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1145\/2304576.2304619"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1007\/s10766-010-0142-5"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2010.2"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1145\/2442516.2442523"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1145\/3168827"},{"key":"ref202","article-title":"Comparative performance analysis of Intel Xeon Phi, GPU, and CPU","author":"Teodoro","year":"2013","journal-title":"arXiv:1311.0378"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.2013.87"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.14778\/2735703.2735704"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1145\/2568088.2576799"},{"key":"ref206","volume-title":"The OpenMP API Specification for Parallel Programming","year":"2021"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-13374-9_5"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370324"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2012.292"},{"key":"ref210","first-page":"878","article-title":"MPI: A message passing interface","volume-title":"Proc. Supercomputing"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2007.7482492"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1145\/3368304"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783720"},{"key":"ref214","doi-asserted-by":"publisher","DOI":"10.1145\/2644865.2541967"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1109\/ISSCC.2016.7418007"},{"key":"ref216","doi-asserted-by":"publisher","DOI":"10.1145\/3093336.3037702"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00060"},{"key":"ref218","doi-asserted-by":"publisher","DOI":"10.1145\/1980462.1980484"},{"key":"ref219","doi-asserted-by":"publisher","DOI":"10.1145\/2716282.2716286"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1145\/2544137.2544139"},{"key":"ref221","article-title":"PyTorch-Direct: Enabling GPU centric data access for very large graph neural network training with irregular accesses","author":"Min","year":"2021","journal-title":"arXiv:2101.07956"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1145\/3399730"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW55747.2022.00097"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.1145\/1815961.1815992"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522352"},{"key":"ref226","first-page":"477","article-title":"SIMD re-convergence at thread frontiers","volume-title":"Proc. 44th Annu. IEEE\/ACM Int. Symp. Microarchitecture (MICRO)","author":"Diamos"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2011.5749714"},{"key":"ref228","first-page":"308","article-title":"Improving GPU performance via large warps and two-level warp scheduling","volume-title":"Proc. 44th Annu. IEEE\/ACM Int. Symp. Microarchitecture (MICRO)","author":"Narasiman"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2012.6237006"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00040"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2013.6494995"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2011.89"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2004.1310763"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2013.6522353"},{"key":"ref235","first-page":"296","article-title":"Hardware transactional memory for GPU architectures","volume-title":"Proc. 44th Annu. IEEE\/ACM Int. Symp. Microarchitecture (MICRO)","author":"Fung"},{"key":"ref236","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2013.82"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751232"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1109\/IA351965.2020.00010"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2012.17"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1007\/BFb0057970"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1145\/2903150.2903155"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2013.6494989"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1145\/2384616.2384625"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1145\/2145816.2145844"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1145\/2737924.2737962"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1145\/2038037.1941574"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-28869-2_16"},{"key":"ref248","doi-asserted-by":"publisher","DOI":"10.1023\/A:1014230429447"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1023\/A:1007559022013"},{"key":"ref250","volume-title":"Supercompilers for Parallel and Vector Computers","author":"Zima","year":"1990"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.1998.742766"},{"key":"ref252","doi-asserted-by":"publisher","DOI":"10.1145\/951710.951714"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751247"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-21487-5_9"},{"key":"ref255","doi-asserted-by":"publisher","DOI":"10.1145\/3528425.3529100"},{"key":"ref256","doi-asserted-by":"publisher","DOI":"10.1145\/2597652.2597682"},{"key":"ref257","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3124856"},{"key":"ref258","doi-asserted-by":"publisher","DOI":"10.1109\/ICPP.1993.165"},{"key":"ref259","doi-asserted-by":"publisher","DOI":"10.1109\/71.207596"},{"key":"ref260","doi-asserted-by":"publisher","DOI":"10.1117\/12.2500268"},{"key":"ref261","doi-asserted-by":"publisher","DOI":"10.1109\/MLHPC.2018.8638639"},{"key":"ref262","doi-asserted-by":"publisher","DOI":"10.1145\/195473.195557"},{"key":"ref263","doi-asserted-by":"publisher","DOI":"10.1145\/301618.301670"},{"key":"ref264","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-40889-4_6"},{"key":"ref265","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-45545-0_22"},{"key":"ref266","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.1999.807526"},{"key":"ref267","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2019.00042"},{"key":"ref268","doi-asserted-by":"publisher","DOI":"10.1109\/TCBB.2011.68"},{"key":"ref269","doi-asserted-by":"publisher","DOI":"10.1109\/TVCG.2021.3059753"},{"key":"ref270","article-title":"Leveraging GPU batching for scalable nonlinear programming through massive Lagrangian decomposition","author":"Kim","year":"2021","journal-title":"arXiv:2106.14995"},{"key":"ref271","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196808"},{"key":"ref272","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2942918"},{"key":"ref273","doi-asserted-by":"publisher","DOI":"10.1002\/jcc.540040211"},{"key":"ref274","article-title":"The OpenMP implementation of NAS parallel benchmarks and its performance","author":"Jin"},{"key":"ref275","doi-asserted-by":"publisher","DOI":"10.2514\/3.12012"},{"key":"ref276","doi-asserted-by":"publisher","DOI":"10.1109\/99.388949"},{"key":"ref277","doi-asserted-by":"publisher","DOI":"10.1145\/2458523.2458533"},{"key":"ref278","doi-asserted-by":"publisher","DOI":"10.1016\/0301-0104(91)87066-5"},{"key":"ref279","volume-title":"Chare kernel and its implementation on multicomputers","author":"Shu","year":"1990"},{"key":"ref280","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2012.51"},{"key":"ref281","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.82"},{"key":"ref282","doi-asserted-by":"publisher","DOI":"10.1145\/2688500.2688517"},{"key":"ref283","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2010.5649549"},{"key":"ref284","doi-asserted-by":"publisher","DOI":"10.1080\/17445760802337010"},{"key":"ref285","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.51"},{"key":"ref286","doi-asserted-by":"publisher","DOI":"10.1145\/3093172.3093237"},{"key":"ref287","doi-asserted-by":"publisher","DOI":"10.1109\/XSW.2013.5"},{"key":"ref288","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2004.840491"},{"key":"ref289","doi-asserted-by":"publisher","DOI":"10.1007\/s11227-019-02841-6"},{"key":"ref290","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-25636-4_10"},{"key":"ref291","doi-asserted-by":"publisher","DOI":"10.1007\/s10766-016-0461-2"},{"key":"ref292","doi-asserted-by":"publisher","DOI":"10.1109\/UIC-ATC-ScalCom-CBDCom-IoP-SmartWorld.2016.0103"},{"key":"ref293","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2012.6402918"},{"key":"ref294","doi-asserted-by":"publisher","DOI":"10.1145\/1122971.1122990"},{"key":"ref295","first-page":"19","article-title":"Memory bandwidth and machine balance in current high performance computers","volume-title":"Proc. IEEE Comput. Soc. Tech. Committee Comput. Archit. (TCCA) Newslett.","author":"McCalpin"},{"key":"ref296","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC.2009.5306797"},{"key":"ref297","doi-asserted-by":"publisher","DOI":"10.1109\/SAAHPC.2011.29"},{"key":"ref298","doi-asserted-by":"publisher","DOI":"10.1145\/2212908.2212924"},{"key":"ref299","doi-asserted-by":"publisher","DOI":"10.1145\/1735688.1735702"},{"key":"ref300","article-title":"Introduction to the HPC challenge benchmark suite","author":"Dongarra","year":"2005"},{"key":"ref301","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2011.5762730"},{"key":"ref302","doi-asserted-by":"publisher","DOI":"10.1088\/1742-6596\/1740\/1\/012056"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6287639\/10380310\/10458910.pdf?arnumber=10458910","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,12,11]],"date-time":"2024-12-11T02:04:03Z","timestamp":1733882643000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10458910\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":302,"URL":"https:\/\/doi.org\/10.1109\/access.2024.3372990","relation":{},"ISSN":["2169-3536"],"issn-type":[{"value":"2169-3536","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]}}}