{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T08:40:03Z","timestamp":1748335203507,"version":"3.41.0"},"reference-count":40,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[1999,12,1]],"date-time":"1999-12-01T00:00:00Z","timestamp":944006400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[1999,12,1]],"date-time":"1999-12-01T00:00:00Z","timestamp":944006400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["International Journal of Parallel Programming"],"published-print":{"date-parts":[[1999,12]]},"DOI":"10.1023\/a:1018780200739","type":"journal-article","created":{"date-parts":[[2003,2,19]],"date-time":"2003-02-19T20:54:14Z","timestamp":1045688054000},"page":"477-503","source":"Crossref","is-referenced-by-count":2,"title":["Tuning Compiler Optimizations for Simultaneous Multithreading"],"prefix":"10.1007","volume":"27","author":[{"given":"Jack L.","family":"Lo","sequence":"first","affiliation":[]},{"given":"Susan J.","family":"Eggers","sequence":"additional","affiliation":[]},{"given":"Henry M.","family":"Levy","sequence":"additional","affiliation":[]},{"given":"Sujay S.","family":"Parekh","sequence":"additional","affiliation":[]},{"given":"Dean M.","family":"Tullsen","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"290892_CR1","doi-asserted-by":"crossref","unstructured":"D. M. Tullsen, S. J. Eggers, and H. M. Levy, Simultaneous multithreading: Maximizing on-chip parallelism, 22nd Ann. Int'l. Symp. Computer Architecture, pp. 392\u2013403 (June 1995).","DOI":"10.1145\/223982.224449"},{"key":"290892_CR2","doi-asserted-by":"crossref","unstructured":"D. M. Tullsen et al., Exploiting choice: Instruction fetch and issue on an implementable simultaneous multithreading processor, 23rd Ann. Int'l. Symp. Computer Architecture, pp. 191\u2013202 (May 1996).","DOI":"10.1145\/232973.232993"},{"issue":"3","key":"290892_CR3","doi-asserted-by":"crossref","first-page":"322","DOI":"10.1145\/263326.263382","volume":"15","author":"J. L. Lo","year":"1997","unstructured":"J. L. Lo et al., Converting thread-level parallelism to instruction-level parallelism via simultaneous multithreading, ACM Trans. Computer Sys., 15(3):322\u2013354 (August 1997).","journal-title":"ACM Trans. Computer Sys."},{"issue":"5","key":"290892_CR4","doi-asserted-by":"crossref","first-page":"12","DOI":"10.1109\/40.621209","volume":"17","author":"S. J. Eggers","year":"1997","unstructured":"S. J. Eggers et al., Simultaneous multithreading: A platform for next-generation processors, IEEE Micro, 17(5):12\u201319 (October 1997).","journal-title":"IEEE Micro"},{"key":"290892_CR5","doi-asserted-by":"crossref","unstructured":"G. Alverson et al., Tera hardware-software cooperation, Proc. ACM Int'l. Conf. Supercomputing (November 1997).","DOI":"10.1145\/509593.509631"},{"issue":"4","key":"290892_CR6","doi-asserted-by":"crossref","first-page":"452","DOI":"10.1109\/71.97902","volume":"2","author":"M. E. Wolf","year":"1991","unstructured":"M. E. Wolf and M. S. Lam, A loop transformation theory and an algorithm to maximize parallelism, IEEE Trans. Parallel and Distrib. Syst., 2(4):452\u2013471 (October 1991).","journal-title":"IEEE Trans. Parallel and Distrib. Syst."},{"key":"290892_CR7","doi-asserted-by":"crossref","unstructured":"M. Cierniak and W. Li, Unifying data and control transformations for distributed sharedmemory machines, ACM SIGPLAN Conf. Progr. Lang. Design and Implementation, pp. 205\u2013217 (June 1995).","DOI":"10.1145\/207110.207145"},{"key":"290892_CR8","doi-asserted-by":"crossref","unstructured":"S. Carr, K. S. McKinley, and C. W. Tseng, Compiler optimizations for improving data locality, Sixth Int' l. Conf. Architectural Support for Progr. Lang. Operat. Syst., pp. 252\u2013262 (October 1994).","DOI":"10.1145\/195473.195557"},{"key":"290892_CR9","doi-asserted-by":"crossref","unstructured":"K. Dixit, New CPU benchmark suites from SPEC, '92 Digest of Papers, pp. 305\u2013310 (February 1992).","DOI":"10.1109\/CMPCON.1992.186729"},{"key":"290892_CR10","unstructured":"SPEC, '95 Technical Manual (August 1995)."},{"key":"290892_CR11","doi-asserted-by":"crossref","unstructured":"S. C. Woo et al., The SPLASH-2 programs: Characterization and methodological considerations, 22nd Ann. Int'l. Symp. Computer Architecture, pp. 24\u201336 (June 1995).","DOI":"10.1145\/223982.223990"},{"issue":"1","key":"290892_CR12","doi-asserted-by":"crossref","first-page":"51","DOI":"10.1007\/BF01205182","volume":"7","author":"P. G. Lowney","year":"1993","unstructured":"P. G. Lowney et al., The multiflow trace scheduling compiler, J. Supercomputing,\n7(1\/2):51\u2013142 (May 1993).","journal-title":"J. Supercomputing"},{"issue":"12","key":"290892_CR13","doi-asserted-by":"crossref","first-page":"84","DOI":"10.1109\/2.546613","volume":"29","author":"M. W. Hall","year":"1996","unstructured":"M. W. Hall et al., Maximizing multiprocessor performance with the SUIF compiler, IEEE Computer,\n29(12):84\u201389 (December 1996).","journal-title":"IEEE Computer"},{"key":"290892_CR14","doi-asserted-by":"crossref","unstructured":"E. Bugnion et al., Compiler-directed page coloring for multiprocessors, Seventh Int'l. Conf. Architectural Support for Progr. Lang. Operat. Syst., pp. 244\u2013255 (October 1997).","DOI":"10.1145\/248208.237195"},{"key":"290892_CR15","volume-title":"Portable Programs for Parallel Processors","author":"J. Boyle","year":"1987","unstructured":"J. Boyle et al., Portable Programs for Parallel Processors, Holt, Rinehart, and Winston, Inc. (1987)."},{"key":"290892_CR16","unstructured":"S. McFarling, Combining Branch Predictors, Technical Report TN-36, DEC-Western Research Laboratory (June 1993)."},{"key":"290892_CR17","doi-asserted-by":"crossref","unstructured":"S. A. Mahlke et al., Effective compiler support for predicated execution using the hyperblock, 25th Int'l. Symp. Microarchitecture, pp. 45\u201354 (December 1992).","DOI":"10.1109\/MICRO.1992.696999"},{"key":"290892_CR18","doi-asserted-by":"crossref","unstructured":"S. Hily and A. Seznec, Out-of-order execution may not be cost-effective on processors featuring simultaneous multithreading, Fifth Int'l. Symp. High Performance Computer Architecture, pp. 64\u201367 (January 1999).","DOI":"10.1109\/HPCA.1999.744331"},{"key":"290892_CR19","doi-asserted-by":"crossref","unstructured":"M. S. Lam, E. E. Rothberg, and M. E. Wolf, The cache performance and optimizations of blocked algorithms, Fourth Int'l. Conf. Architectural Support for Progr. Lang. Operat. Syst., pp. 63\u201374 (April 1991).","DOI":"10.1145\/106972.106981"},{"key":"290892_CR20","doi-asserted-by":"crossref","unstructured":"S. Coleman and K. S. McKinley, Tile size selection using cache organization and data layout, ACM SIGPLAN Conf. Progr. Lang. Design and Implementation, pp. 279\u2013290 (June 1995).","DOI":"10.1145\/207110.207162"},{"key":"290892_CR21","doi-asserted-by":"crossref","unstructured":"S. Carr and K. Kennedy, Compiler blockability of numerical algorithms, Proc. ACM Int'l. Conf. Supercomputing, pp. 114\u2013124 (November 1992).","DOI":"10.1109\/SUPERC.1992.236704"},{"key":"290892_CR22","doi-asserted-by":"crossref","unstructured":"L. Carter, J. Ferrante, and S. F. Hummel, Hierarchical tiling for improved superscalar performance, Proc. Ninth Int'l. Parallel Processing Symp., pp. 239\u2013245 ( April 1995).","DOI":"10.1109\/IPPS.1995.395939"},{"key":"290892_CR23","doi-asserted-by":"crossref","unstructured":"T. C. Mowry, M. S. Lam, and A. Gupta, Design and evaluation of a compiler algorithm for prefetching, Fifth Int' l. Conf. Architectural Support for Progr. Lang. Operat. Syst., pp. 62\u201375 (September 1992).","DOI":"10.1145\/143365.143488"},{"key":"290892_CR24","doi-asserted-by":"crossref","unstructured":"P. Hsu and E. Davidson, Highly concurrent scalar processing, 13th Ann. Int'l. Symp. Computer Architecture, pp. 386\u2013395 (June 1986).","DOI":"10.1145\/17356.17401"},{"key":"290892_CR25","doi-asserted-by":"crossref","first-page":"12","DOI":"10.1109\/2.19820","volume":"22","author":"B. R. Rau","year":"1989","unstructured":"B. R. Rau et al., The Cydra 5 departmental supercomputer, IEEE Computer,\n22:12\u201335 (January 1989).","journal-title":"IEEE Computer"},{"key":"290892_CR26","doi-asserted-by":"crossref","unstructured":"J. Allen et al., Conversion of control dependence to data dependence, Conf. Record of the Tenth Ann. ACM Symp. Principles Progr. Lang., pp. 177\u2013189 ( January 1983).","DOI":"10.1145\/567067.567085"},{"issue":"9","key":"290892_CR27","doi-asserted-by":"crossref","first-page":"18","DOI":"10.1109\/C-M.1981.220595","volume":"14","author":"A. E. Charlesworth","year":"1981","unstructured":"A. E. Charlesworth, An approach to scientific array processing: The architectural design of the AP-120B\/FPS-164 family, IEEE Computer,\n14(9):18\u201327 (December 1981).","journal-title":"IEEE Computer"},{"key":"290892_CR28","doi-asserted-by":"crossref","unstructured":"B. R. Rau and C. Glaeser, Some scheduling techniques and an easily schedulable horizontal architecture for high performance scientific computing, 14th Ann. Workshop on Microprogr., pp. 183\u2013197 (October 1981).","DOI":"10.1145\/1014192.802449"},{"key":"290892_CR29","doi-asserted-by":"crossref","unstructured":"M. Lam, Software pipelining: An effective scheduling technique for VLIW machines, ACM SIGPLAN Conf. Progr. Lang. Design and Implementation, pp. 318\u2013328 (June 1988).","DOI":"10.1145\/53990.54022"},{"key":"290892_CR30","doi-asserted-by":"crossref","unstructured":"A. Aiken and A. Nicolau, Optimal loop parallelization, ACM SIGPLAN Conf. Progr. Lang. Design and Implementation, pp. 308\u2013317 (June 1988).","DOI":"10.1145\/53990.54021"},{"key":"290892_CR31","doi-asserted-by":"crossref","unstructured":"D. M. Tullsen et al., Supporting fine-grained synchronization on a simultaneous multithreading processor, Fifth Int'l. Symp. High Performance Computer Architecture, pp. 54\u201358 (January 1999).","DOI":"10.1109\/HPCA.1999.744326"},{"key":"290892_CR32","doi-asserted-by":"crossref","unstructured":"J. M. Anderson, S. P. Amarasinghe, and M. S. Lam, Data and computation transformations for multiprocessors, Fifth ACM Sigplan Symp. Principles Practice Parallel Progr., pp. 166\u2013178 (July 1995).","DOI":"10.1145\/209936.209954"},{"issue":"12","key":"290892_CR33","doi-asserted-by":"crossref","first-page":"229","DOI":"10.1007\/BF01205185","volume":"7","author":"W. W. Hwu","year":"1993","unstructured":"W. W. Hwu et al., The superblock: An effective technique for VLIW and superscalar compilation, J. Supercomputing,\n7(12):229\u2013248 (May 1993).","journal-title":"J. Supercomputing"},{"key":"290892_CR34","doi-asserted-by":"crossref","unstructured":"M. Lam and R. Wilson, Limits of control flow on parallelism, 19th Ann. Int'l. Symp. Computer Architecture, pp. 46\u201357 (May 1992).","DOI":"10.1109\/ISCA.1992.753303"},{"issue":"5","key":"290892_CR35","doi-asserted-by":"crossref","first-page":"587","DOI":"10.1016\/0743-7315(88)90014-7","volume":"5","author":"D. Gannon","year":"1988","unstructured":"D. Gannon, W. Jalby, and K. Gallivan, Strategies for cache and local memory management by global program transformation, J. Parallel Distribut. Comput., 5(5):587\u2013616 (October 1988).","journal-title":"J. Parallel Distribut. Comput."},{"key":"290892_CR36","doi-asserted-by":"crossref","unstructured":"M. E. Wolf and M. S. Lam, A data locality optimizing algorithm, ACM SIGPLAN Conf. Progr. Lang. Design and Implementation, pp. 30\u201344 (June 1991).","DOI":"10.1145\/113445.113449"},{"key":"290892_CR37","doi-asserted-by":"crossref","unstructured":"K. Kennedy and K. S. McKinley, Maximizing loop parallelism and improving data locality via loop fusion and distribution, Sixth Int'l. Workshop Languages and Compilers for Parallel Computing, pp. 301\u2013319 (August 1993).","DOI":"10.1007\/3-540-57659-2_18"},{"key":"290892_CR38","doi-asserted-by":"crossref","unstructured":"R. Alverson et al., The Tera computer system, Proc. ACM Int'l. Conf. Supercomputing, pp. 1\u20136 (June 1990).","DOI":"10.1145\/255129.255132"},{"key":"290892_CR39","unstructured":"T. N. Vijaykumar and G. S. Sohi, Task selection for a multiscalar processor, 31st Int'l. Symp. Microarchitecture (November 1998)."},{"issue":"7","key":"290892_CR40","doi-asserted-by":"crossref","first-page":"42","DOI":"10.1109\/MC.1993.274941","volume":"27","author":"J. P. Singh","year":"1993","unstructured":"J. P. Singh, J. L. Hennessy, and A. Gupta, Scaling parallel programs for multiprocessors: Methodology and examples, IEEE Computer, 27(7):42\u201350 (July 1993).","journal-title":"IEEE Computer"}],"container-title":["International Journal of Parallel Programming"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1023\/A:1018780200739.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1023\/A:1018780200739\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1023\/A:1018780200739.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T08:16:00Z","timestamp":1748333760000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1023\/A:1018780200739"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[1999,12]]},"references-count":40,"journal-issue":{"issue":"6","published-print":{"date-parts":[[1999,12]]}},"alternative-id":["290892"],"URL":"https:\/\/doi.org\/10.1023\/a:1018780200739","relation":{},"ISSN":["0885-7458","1573-7640"],"issn-type":[{"type":"print","value":"0885-7458"},{"type":"electronic","value":"1573-7640"}],"subject":[],"published":{"date-parts":[[1999,12]]}}}