{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,22]],"date-time":"2026-03-22T22:47:37Z","timestamp":1774219657927,"version":"3.50.1"},"reference-count":84,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"10","license":[{"start":{"date-parts":[[2018,10,1]],"date-time":"2018-10-01T00:00:00Z","timestamp":1538352000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"}],"funder":[{"name":"KAUST Extreme Computing Research Center"},{"name":"KAUST Supercomputing Laboratory"},{"name":"KAUST Information Technology Research Division"},{"name":"Intel Parallel Computing Centers"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Parallel Distrib. Syst."],"published-print":{"date-parts":[[2018,10,1]]},"DOI":"10.1109\/tpds.2018.2826533","type":"journal-article","created":{"date-parts":[[2018,4,13]],"date-time":"2018-04-13T19:10:27Z","timestamp":1523646627000},"page":"2317-2332","source":"Crossref","is-referenced-by-count":16,"title":["Optimizations of Unstructured Aerodynamics Computations for Many-core Architectures"],"prefix":"10.1109","volume":"29","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4988-4674","authenticated-orcid":false,"given":"Mohammed A.","family":"Al Farhan","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4052-7224","authenticated-orcid":false,"given":"David E.","family":"Keyes","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2014.2365784"},{"key":"ref72","first-page":"657","article-title":"A study of main-memory hash joins on many-core processor: A case with Intel Knights Landing architecture","author":"cheng","year":"2017","journal-title":"Proc 26th Int Conf Inf Knowl Manage"},{"key":"ref71","first-page":"24:1","article-title":"Efficient SIMD and MIMD parallelization of hash-based aggregation by conflict mitigation","author":"jiang","year":"2017","journal-title":"Proc 21st ACM Int Conf Supercomput"},{"key":"ref70","first-page":"9:1","article-title":"Scaling deep learning on GPU and Knights Landing clusters","author":"you","year":"2017","journal-title":"Proc 30th Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref76","first-page":"339","article-title":"Applying the roofline performance model to the Intel Xeon Phi Knights Landing processor","volume":"9945","author":"doerfler","year":"2016","journal-title":"Proc 31st Int Conf High Perform Comput"},{"key":"ref77","first-page":"26:1","article-title":"Exploring and analyzing the real impact of modern on-package memory on HPC scientific kernels","author":"li","year":"2017","journal-title":"Proc 30th Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1145\/3064176.3064191"},{"key":"ref39","first-page":"1058","article-title":"Sparse tensor factorization on many-core processors with high-bandwidth memory","author":"smith","year":"2017","journal-title":"Proc 31st IEEE Int Parallel Distrib Process Symp"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1145\/2493123.2462916"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1145\/2807591.2807644"},{"key":"ref78","first-page":"39:1","article-title":"An efficient MPI\/OpenMP parallelization of the Hartree-Fock method for the second generation of intel&#x00AE; xeon&#x00AE; phi&#x2122; processor","author":"mironov","year":"2017","journal-title":"Proc 30th Int Conf High Perform Comput Netw Storage Anal"},{"key":"ref79","first-page":"892","article-title":"Parallel graph coloring for manycore architectures","author":"deveci","year":"2016","journal-title":"Proc Int Symp Parallel Distrib Process"},{"key":"ref33","author":"zhang","year":"2016","journal-title":"Guide to Automatic Vectorization with Intel AVX-512 Instructions in Knights Landing Processors"},{"key":"ref32","author":"valdimirov","year":"2015","journal-title":"Optimization Techniques for the Intel MIC Architecture Part 3 of 3 False Sharing and Padding"},{"key":"ref31","author":"evans","year":"2006","journal-title":"A Scalable Concurrent malloc(3) Implementation for Freebsd"},{"key":"ref30","author":"cantalupo","year":"2015","journal-title":"User extensible heap manager for heterogeneous memory platforms and mixed memory policies"},{"key":"ref37","first-page":"233","article-title":"Towards realistic performance bounds for implicit CFD codes","author":"gropp","year":"1999","journal-title":"Proc 11th Int Parallel Comput Fluid Dyn Conf"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018756"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1145\/3055399.3055490"},{"key":"ref34","author":"rahman","year":"2018"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339594"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2015.2453972"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.2514\/6.2015-1949"},{"key":"ref63","doi-asserted-by":"crossref","first-page":"343","DOI":"10.1007\/978-3-319-41321-1_18","article-title":"High order seismic simulations on the Intel Xeon Phi processor (Knights Landing)","volume":"9697","author":"heinecke","year":"2016","journal-title":"Proc 31st Int Conf High Perform Comput"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/800195.805928"},{"key":"ref64","doi-asserted-by":"crossref","first-page":"12:1","DOI":"10.1145\/3155290","article-title":"Multidimensional intratile parallelization for memory-starved stencil computations","volume":"4","author":"malas","year":"2017","journal-title":"ACM Trans on Parallel Computing"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1137\/S1064827595287997"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1137\/140991133"},{"key":"ref66","first-page":"142","article-title":"Optimization of an electromagnetics code with multicore wavefront diamond blocking and multi-dimensional intra-tile parallelization","author":"malas","year":"2016","journal-title":"Proc Parallel Distrib Process Symp"},{"key":"ref29","author":"asai","year":"2018","journal-title":"Optimization Techniques for the Intel MIC Architecture Part 1 of 3 Multi-Threading and Parallel Reduction"},{"key":"ref67","first-page":"234","article-title":"Towards highly scalable Ab initio molecular dynamics (AIMD) simulations on the Intel Knights Landing manycore processor","author":"jacquelin","year":"2017","journal-title":"Proc 31st IEEE Int Parallel Distrib Process Symp"},{"key":"ref68","first-page":"1","article-title":"Optimizing and tuning the fast multipole method for state-of-the-art multicore architectures","author":"chandramowlishwaran","year":"2010","journal-title":"Proc 24th Parallel Distrib Process Symp"},{"key":"ref69","first-page":"8:1","article-title":"Compile-time optimized and statically scheduled N-D convnet primitives for multi-core and many-core (xeon phi) cpus","author":"zlateski","year":"2017","journal-title":"Proc 21st ACM Int Conf Supercomput"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1145\/2751205.2751241"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2016.2516540"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1137\/S0036142996304796"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/InPar.2012.6339601"},{"key":"ref21","first-page":"213","article-title":"Optimization and parallelization of B-Spline based orbital evaluations in QMC on multi\/many-core shared memory processors","author":"mathuriya","year":"2017","journal-title":"Proc 31st IEEE Int Parallel Distrib Process Symp"},{"key":"ref24","author":"valdimirov","year":"2018","journal-title":"Optimization Techniques for the Intel MIC Architecture Part 2 of 3 Strip-Mining for Vectorization"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1016\/B978-0-12-802118-7.00010-8"},{"key":"ref26","article-title":"METIS Web page","author":"karypis","year":"2013"},{"key":"ref25","author":"asai","year":"2017","journal-title":"Optimization of Hamerly&#x2019;s K-Means Clustering Algorithm CFXKMeans Library"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178500"},{"key":"ref51","first-page":"553","article-title":"Performance evaluation of computation and communication kernels of the fast multipole method on intel manycore architecture","volume":"10417","author":"abduljabbar","year":"2017","journal-title":"Proc 23rd Int Eur Conf Parallel Distrib Comput"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.2514\/1.J053813"},{"key":"ref58","author":"vladimirov","year":"2017","journal-title":"A Survey and Benchmarks of Intel Xeon Gold and Platinum Processors"},{"key":"ref57","author":"kasliwal","year":"2017","journal-title":"A Performance-Based Comparison of C\/C++ Compilers"},{"key":"ref56","author":"ragate","year":"2017","journal-title":"Optimization of Real-Time Object Detection on Intel&#x00AE; Xeon&#x00AE; Scalable Processors"},{"key":"ref55","author":"eltablawy","year":"2017","journal-title":"Capabilities of Intel&#x00AE; AVX-512 in Intel&#x00AE; Xeon&#x00AE; Scalable Processors (Skylake)"},{"key":"ref54","author":"mccalpin","year":"2018","journal-title":"Stream Sustainable Memory Bandwidth in High Performance Computers"},{"key":"ref53","first-page":"19","article-title":"Memory bandwidth and machine balance in high performance computers","author":"mccalpin","year":"1995","journal-title":"IEEE Tech Committee Comput Archit Newslett"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1145\/1465482.1465560"},{"key":"ref10","article-title":"PETSc Web page","author":"balay","year":"2016"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1016\/S0167-8191(00)00075-2"},{"key":"ref40","year":"2017"},{"key":"ref12","article-title":"Achieving high sustained performance in an unstructured mesh CFD application","author":"anderson","year":"1999","journal-title":"Proc ACM\/IEEE Conf Supercomput"},{"key":"ref13","first-page":"12","article-title":"Hybrid programming model for implicit PDE simulations on multicore architectures","author":"kaushik","year":"2011","journal-title":"Proc 7th Int Workshop OpenMP"},{"key":"ref14","first-page":"723","article-title":"Exploring shared-memory optimizations for an unstructured mesh CFD application on modern parallel systems","author":"mudigere","year":"2015","journal-title":"Proc Int Symp Parallel Distrib Process"},{"key":"ref15","author":"jeffers","year":"2013","journal-title":"Intel Xeon Phi Coprocessor High Performance Programming"},{"key":"ref82","doi-asserted-by":"crossref","first-page":"79","DOI":"10.1007\/978-3-319-58667-0_5","article-title":"Communication reducing algorithms for distributed hierarchical N-Body problems with boundary distributions","volume":"10266","author":"abduljabbar","year":"2017","journal-title":"Proc 32nd Int Conf High Perform Comput"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2016.06.001"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1145\/3018743.3018763"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.jcp.2003.08.010"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178513"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1177\/109434200001400202"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1137\/140968896"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1137\/S106482750241044X"},{"key":"ref80","first-page":"65","article-title":"Extending a C-like language for portable SIMD programming","author":"lei\u00dfa","year":"2012","journal-title":"Proc 17th ACM SIGPLAN Symp Principles Practice Parallel Program"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1006\/jcph.1996.0219"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1016\/0045-7930(94)90023-X"},{"key":"ref6","first-page":"18","article-title":"An optimized multicolor point-implicit solver for unstructured grid applications on graphics processing units","author":"zubair","year":"2016","journal-title":"Proc 4th Workshop Irregular Appl Archit Algorithms"},{"key":"ref5","article-title":"Production Level CFD Code Acceleration for Hybrid Many-Core Architectures","author":"duffy","year":"2012"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4612-1986-6_8"},{"key":"ref7","article-title":"FUN3D Web page","author":"anderson","year":"2017"},{"key":"ref49","first-page":"145","article-title":"Implementing database operations using simd instructions","author":"zhou","year":"2002","journal-title":"Proc Int Conf Manage Data"},{"key":"ref9","author":"balay","year":"2018"},{"key":"ref46","first-page":"297","article-title":"Capability models for manycore memory systems: A case-study with Xeon Phi KNL","author":"ramos","year":"2017","journal-title":"Proc Int Symp Parallel Distrib Process"},{"key":"ref45","author":"codreanu","year":"2017","journal-title":"Best Practice Guide - Knights Landing"},{"key":"ref48","article-title":"Intel advisor 2017","year":"2017"},{"key":"ref47","article-title":"Intel&#x00AE; xeon&#x00AE; processor scalable family technical overview","author":"mulnix","year":"2017"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2016.25"},{"key":"ref41","author":"jeffers","year":"2016"},{"key":"ref44","author":"asai","year":"2016","journal-title":"Clustering Modes in Knights Landing Processors Developer&#x2019;s Guide"},{"key":"ref43","author":"asai","year":"2016","journal-title":"MCDRAM as High-Bandwidth Memory (HBM) in Knights Landing Processors Developer&#x2019;s Guide"}],"container-title":["IEEE Transactions on Parallel and Distributed Systems"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/71\/8458269\/08337750.pdf?arnumber=8337750","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,2]],"date-time":"2023-09-02T03:28:25Z","timestamp":1693625305000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8337750\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018,10,1]]},"references-count":84,"journal-issue":{"issue":"10"},"URL":"https:\/\/doi.org\/10.1109\/tpds.2018.2826533","relation":{},"ISSN":["1045-9219","1558-2183","2161-9883"],"issn-type":[{"value":"1045-9219","type":"print"},{"value":"1558-2183","type":"electronic"},{"value":"2161-9883","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018,10,1]]}}}