{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,24]],"date-time":"2025-06-24T17:06:35Z","timestamp":1750784795657},"reference-count":91,"publisher":"Elsevier","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2014]]},"DOI":"10.1016\/b978-0-12-420232-0.00005-2","type":"book-chapter","created":{"date-parts":[[2014,1,11]],"date-time":"2014-01-11T14:30:15Z","timestamp":1389450615000},"page":"203-251","source":"Crossref","is-referenced-by-count":4,"title":["Manual Parallelization Versus State-of-the-Art Parallelization Techniques"],"prefix":"10.1016","author":[{"given":"Aleksandar","family":"Vitorovi\u0107","sequence":"first","affiliation":[]},{"given":"Milo V.","family":"Toma\u0161evi\u0107","sequence":"additional","affiliation":[]},{"given":"Veljko M.","family":"Milutinovi\u0107","sequence":"additional","affiliation":[]}],"member":"78","reference":[{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0005","series-title":"Parallel Computer Architecture: A Hardware\/Software Approach","author":"Culler","year":"1998"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0010","series-title":"Chip Multiprocessor Architecture Techniques to Improve Throughput and Latency","author":"Olukotun","year":"2007"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0015","series-title":"The Landscape of Parallel Computing Research: A View from Berkeley","author":"Asanovic","year":"2006"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0020","unstructured":"SPEC CPU2006. http:\/\/www.spec.org\/cpu2006\/ (accessed 23 July 2013)."},{"issue":"4","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0025","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/1186736.1186737","article-title":"SPEC CPU2006 benchmark descriptions","volume":"34","author":"Henning","year":"2006","journal-title":"SIGARCH Comput. Archit. News"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0030","series-title":"Parallel Programming in OpenMP","author":"Chandra","year":"2001"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0035","first-page":"53","article-title":"Parallelization of DOALL and DOACROSS loops\u2014a survey","volume":"45","author":"Hurson","year":"1997"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0040","series-title":"Optimizing Compilers for Modern Architectures: A Dependence-Based Approach","author":"Allen","year":"2001"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0045","series-title":"Compilers: Principles, Techniques, and Tools Used","author":"Aho","year":"1986"},{"issue":"2","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0050","doi-asserted-by":"crossref","first-page":"160","DOI":"10.1109\/71.752782","article-title":"The LRPD test: speculative run-time parallelization of loops with privatization and reduction parallelization","volume":"10","author":"Rauchwerger","year":"1999","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0055","article-title":"MapReduce: simplified data processing on large clusters","author":"Dean","year":"2004"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0060","unstructured":"M. Wolfe, Understanding the CUDA data parallel threading model. http:\/\/www.pgroup.com\/lit\/articles\/insider\/v2n1a5.htm (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0065","unstructured":"Message Passing Interface (MPI) tutorial. https:\/\/computing.llnl.gov\/tutorials\/mpi\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0070","unstructured":"OpenMP tutorial. https:\/\/computing.llnl.gov\/tutorials\/openMP\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0075","first-page":"212","article-title":"The implementation of the Cilk-5 multithreaded language","author":"Frigo","year":"1998"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0080","first-page":"522","article-title":"The Cilk++ concurrency platform","author":"Leiserson","year":"2009"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0085","first-page":"112","article-title":"The Sisal model of functional programming and its implementation","author":"Gaudiot","year":"1997"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0090","series-title":"Introduction to UPC and Language Specification","author":"Carlson","year":"1999"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0095","series-title":"Intel Threading Building Blocks: Outfitting C++ for Multicore Processor Parallelism","author":"Reinders","year":"2007"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0100","series-title":"Languages and Compilers for Parallel Computing","first-page":"193","article-title":"STAPL: an adaptive, generic parallel C++ library","author":"An","year":"2001"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0105","series-title":"Algorithmic Skeletons: Structured Management of Parallel Computation","author":"Cole","year":"1991"},{"issue":"12","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0110","doi-asserted-by":"crossref","first-page":"1135","DOI":"10.1002\/spe.1026","article-title":"A survey of algorithmic skeleton frameworks: high\u2010level structured parallel programming enablers","volume":"40","author":"Gonz\u00e1lez\u2010V\u00e9lez","year":"2010","journal-title":"Softw. Pract. Exp."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0115","first-page":"289","article-title":"Skandium: multi-core programming with algorithmic skeletons","author":"Leyton","year":"2010"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0120","first-page":"764","article-title":"Two fundamental concepts in skeletal parallel programming","author":"Benoit","year":"2005"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0125","article-title":"Consistency analysis in bloom: a CALM and collected approach","author":"Alvaro","year":"2011"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0130","doi-asserted-by":"crossref","DOI":"10.1145\/1272996.1273005","article-title":"Dryad: distributed data-parallel programs from sequential building blocks","author":"Isard","year":"2007"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0135","unstructured":"Apache\u2122 Hadoop\u00ae. http:\/\/hadoop.apache.org\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0140","unstructured":"MathWorks MATLAB. http:\/\/www.mathworks.com\/products\/matlab\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0145","unstructured":"M. Griebl, Automatic parallelization of loop programs for distributed memory architectures, habilitation thesis, University of Passau, 2004."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0150","first-page":"23","article-title":"Efficient code generation for automatic parallelization and optimization","author":"Bastoul","year":"2003"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0155","series-title":"CHiLL: A Framework for Composing High-Level Loop Transformations","author":"Chen","year":"2008"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0160","first-page":"458","article-title":"Kremlin: rethinking and rebooting gprof for the multicore age","author":"Garcia","year":"2011"},{"issue":"12","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0165","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1109\/2.546613","article-title":"Maximizing multiprocessor performance with the SUIF compiler","volume":"29","author":"Hall","year":"1996","journal-title":"Computer"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0170","series-title":"Polaris: An Optimizing Compiler for Parallel Workstations and Scalable Multiprocessors","author":"Padua","year":"1996"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0175","first-page":"243","article-title":"The value evolution graph and its use in memory reference analysis","author":"Rus","year":"2004"},{"issue":"12","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0180","doi-asserted-by":"crossref","first-page":"36","DOI":"10.1109\/MC.2009.385","article-title":"Cetus: a source-to-source compiler infrastructure for multicores","volume":"42","author":"Dave","year":"2009","journal-title":"Computer"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0185","unstructured":"D. Quinlan, et al., ROSE user manual: a tool for building source-to-source translators. http:\/\/rosecompiler.org\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0190","first-page":"189","article-title":"Towards automatic translation of OpenMP to MPI","author":"Basumallik","year":"2005"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0195","unstructured":"DMS Software Reengineering Toolkit. http:\/\/www.semdesigns.com\/products\/DMS\/DMSToolkit.html (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0200","first-page":"103","article-title":"Nanos mercurium: a research compiler for OpenMP","author":"Gonz\u00e0lez","year":"2004"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0205","first-page":"287","article-title":"Merge: a programming model for heterogeneous multi-core systems abstract","author":"Linderman","year":"2008"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0210","first-page":"137","article-title":"Compiler and runtime support for enabling generalized reduction computations on heterogeneous parallel configurations","author":"Ravi","year":"2010"},{"issue":"6","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0215","doi-asserted-by":"crossref","first-page":"592","DOI":"10.1007\/s10766-008-0085-2","article-title":"A compile\/run-time environment for the automatic transformation of linked list data structures","volume":"36","author":"van der Spek","year":"2008","journal-title":"Int. J. Parallel Prog."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0220","series-title":"Defining and Implementing Commutativity Conditions for Parallel Execution","author":"Kulkarni","year":"2009"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0225","series-title":"Introduction to Algorithms","author":"Cormen","year":"2003"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0230","first-page":"68","article-title":"Open nesting in software transactional memory","author":"Ni","year":"2007"},{"issue":"5\u20136","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0235","doi-asserted-by":"crossref","first-page":"361","DOI":"10.1007\/s10766-010-0139-0","article-title":"Semantic-aware automatic parallelization of modern applications using high-level abstractions","volume":"38","author":"Liao","year":"2010","journal-title":"Int. J. Parallel Prog."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0240","first-page":"1","article-title":"Parallelizing irregular C codes assisted by interprocedural shape analysis","author":"Asenjo","year":"2008"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0245","doi-asserted-by":"crossref","DOI":"10.1145\/1542476.1542496","article-title":"Towards a holistic approach to auto-parallelization: integrating profile-driven parallelism detection and machine-learning based mapping","author":"Tournavitis","year":"2009"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0250","unstructured":"TILE64 Processor. http:\/\/www.tilera.com\/products\/processors\/TILE64 (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0255","first-page":"278","article-title":"Dynamic hammock predication for non-predicated instruction set architectures","author":"Klauser","year":"1998"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0260","first-page":"92","article-title":"A framework for balancing control flow and predication","author":"August","year":"1997"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0265","first-page":"367","article-title":"Profile-assisted compiler support for dynamic predication in diverge-merge processors","author":"Kim","year":"2007"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0270","first-page":"201","article-title":"Lightweight predication support for out of order processors","author":"Stephenson","year":"2009"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0275","first-page":"227","article-title":"Integrated predicated and speculative execution in the IMPACT EPIC architecture","author":"August","year":"1998"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0280","article-title":"Toward thread-level speculation for coarse-grained parallelism with regular access patterns","author":"Ramaseshan","year":"2008"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0285","first-page":"77","article-title":"Exploring speculative parallelism in SPEC2006","author":"Packirisamy","year":"2009"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0290","first-page":"147","article-title":"Heuristics for profile-driven method-level speculative parallelization","author":"Whaley","year":"2005"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0295","series-title":"Languages and Compilers for Parallel Computing","first-page":"232","article-title":"Compiler-driven dependence profiling to guide program parallelization","author":"Wu","year":"2008"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0300","first-page":"1","article-title":"Experiences of using a dependence profiler to assist parallelization for multi-cores","author":"Das","year":"2010"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0305","first-page":"105","article-title":"Automatic thread extraction with decoupled software pipelining","author":"Ottoni","year":"2005"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0310","first-page":"49","article-title":"Speculative decoupled software pipelining","author":"Vachharajani","year":"2007"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0315","first-page":"69","article-title":"Revisiting the sequential programming model for multi-core","author":"Bridges","year":"2007"},{"issue":"12","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0320","doi-asserted-by":"crossref","first-page":"58","DOI":"10.1145\/1610252.1610271","article-title":"The Bulk Multicore architecture for improved programmability","volume":"52","author":"Torrellas","year":"2009","journal-title":"Commun. ACM"},{"issue":"1","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0325","doi-asserted-by":"crossref","first-page":"1","DOI":"10.2200\/S00070ED1V01Y200611CAC002","article-title":"Transactional memory","volume":"1","author":"Larus","year":"2006","journal-title":"Synth. Lect. Comput. Archit."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0330","first-page":"65","article-title":"Speculative parallelization using software multi-threaded transactions","author":"Raman","year":"2010"},{"issue":"10","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0335","doi-asserted-by":"crossref","first-page":"50","DOI":"10.1145\/1400181.1400197","article-title":"A closer look at GPUs","volume":"51","author":"Fatahalian","year":"2008","journal-title":"Commun. ACM"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0340","unstructured":"Tuning MPI programs for peak performance. http:\/\/www.mcs.anl.gov\/research\/projects\/mpi\/tutorial\/perf\/mpiperf\/index.htm (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0345","unstructured":"V. Packirisamy, Efficient architecture support for thread-level speculation, University of Minnesota, PhD Thesis, April 2009."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0350","unstructured":"ParBenCCh 1.0 Parallel C++ Benchmarking Suite. https:\/\/asc.llnl.gov\/computing_resources\/purple\/archive\/benchmarks\/parbencch\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0355","first-page":"24","article-title":"The SPLASH-2 programs: characterization and methodological considerations","author":"Woo","year":"1995"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0360","first-page":"348","article-title":"Experiments with auto-parallelizing SPEC2000FP benchmarks","volume":"vol. 3602","author":"Zhang","year":"2005"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0365","article-title":"Uncovering hidden loop level parallelism in sequential applications","author":"Zhong","year":"2008"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0370","first-page":"142","article-title":"Exposing speculative thread parallelism in SPEC2000","author":"Prabhu","year":"2005"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0375","first-page":"215","article-title":"Tight analysis of the performance potential of thread speculation using spec CPU 2006","author":"Kejariwal","year":"2007"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0380","unstructured":"BioBench and BioParallel: a benchmark suite for bioinformatics applications. http:\/\/www.ece.umd.edu\/biobench\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0385","unstructured":"NASA Supercomputing Benchmarks. https:\/\/www.nas.nasa.gov\/cgi-bin\/software\/start (accessed 23 July 2013)."},{"issue":"1","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0390","doi-asserted-by":"crossref","first-page":"77","DOI":"10.1145\/1241601.1241617","article-title":"C++ benchmarks in SPEC CPU2006","volume":"35","author":"Wong","year":"2007","journal-title":"SIGARCH Comput. Archit. News"},{"issue":"16","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0395","doi-asserted-by":"crossref","first-page":"1781","DOI":"10.1002\/jcc.20289","article-title":"Scalable molecular dynamics with NAMD","volume":"26","author":"Phillips","year":"2005","journal-title":"J. Comput. Chem."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0400","unstructured":"J. Phillips, NAMD serial and parallel performance. http:\/\/www.ks.uiuc.edu\/Research\/namd\/tutorial\/PSC2001\/pdf\/performance.pdf (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0405","unstructured":"deal.II Homepage. http:\/\/www.dealii.org\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0410","unstructured":"SoPlex, SMoPlex and DoPlex: parallel and object-oriented simplex algorithms. http:\/\/typo.zib.de\/vis-long_projects\/par\/simplex.html (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0415","unstructured":"POV-Ray\u2014the persistence of vision raytracer. http:\/\/www.povray.org\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0420","unstructured":"PVM patch for POV-Ray. http:\/\/pvmpov.sourceforge.net\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0425","unstructured":"OMNeT++ Network Simulation Framework. http:\/\/omnetpp.org\/ (accessed 23 July 2013)."},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0430","first-page":"82","article-title":"Parallel Astar search on message-passing architectures","volume":"vol. 1","author":"Cvetanovic","year":"1990"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0435","unstructured":"Berkeley Parallel Browser Project. http:\/\/parallelbrowser.blogspot.ch\/ (accessed 23 July 2013)."},{"issue":"1","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0440","doi-asserted-by":"crossref","first-page":"102","DOI":"10.1145\/1241601.1241621","article-title":"Subroutine profiling results for the CPU2006 benchmarks","volume":"35","author":"Weicker","year":"2007","journal-title":"SIGARCH Comput. Archit. News"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0445","unstructured":"GNU gprof. http:\/\/www.cs.utah.edu\/dept\/old\/texinfo\/as\/gprof.html (accessed 23 July 2013)."},{"issue":"1","key":"10.1016\/B978-0-12-420232-0.00005-2_bb0450","doi-asserted-by":"crossref","first-page":"130","DOI":"10.1145\/1241601.1241625","article-title":"SPEC CPU2006 benchmark tools","volume":"35","author":"Spradling","year":"2007","journal-title":"SIGARCH Comput. Archit. News"},{"key":"10.1016\/B978-0-12-420232-0.00005-2_bb0455","series-title":"An Introduction to Parallel Algorithms","author":"JaJa","year":"1992"}],"container-title":["Advances in Computers"],"original-title":[],"deposited":{"date-parts":[[2019,8,6]],"date-time":"2019-08-06T11:08:39Z","timestamp":1565089719000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/B9780124202320000052"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014]]},"references-count":91,"URL":"https:\/\/doi.org\/10.1016\/b978-0-12-420232-0.00005-2","relation":{},"ISSN":["0065-2458"],"issn-type":[{"value":"0065-2458","type":"print"}],"subject":[],"published":{"date-parts":[[2014]]}}}