{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,5,28]],"date-time":"2025-05-28T04:18:28Z","timestamp":1748405908359,"version":"3.41.0"},"publisher-location":"Cham","reference-count":33,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319174723"},{"type":"electronic","value":"9783319174730"}],"license":[{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015]]},"DOI":"10.1007\/978-3-319-17473-0_6","type":"book-chapter","created":{"date-parts":[[2015,4,30]],"date-time":"2015-04-30T09:59:39Z","timestamp":1430387979000},"page":"82-97","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3,"title":["Understanding Co-run Degradations on Integrated Heterogeneous Processors"],"prefix":"10.1007","author":[{"given":"Qi","family":"Zhu","sequence":"first","affiliation":[]},{"given":"Bo","family":"Wu","sequence":"additional","affiliation":[]},{"given":"Xipeng","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Li","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Zhiying","family":"Wang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2015,5,1]]},"reference":[{"issue":"4","key":"6_CR1","doi-asserted-by":"publisher","first-page":"379","DOI":"10.1109\/71.273046","volume":"5","author":"EP Markatos","year":"1994","unstructured":"Markatos, E.P., LeBlanc, T.J.: Using processor affinity in loop scheduling on shared-memory multiprocessors. IEEE Trans. Parallel Distrib. Syst. 5(4), 379\u2013400 (1994)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"2","key":"6_CR2","doi-asserted-by":"publisher","first-page":"131","DOI":"10.1109\/71.207589","volume":"4","author":"MS Squillante","year":"1993","unstructured":"Squillante, M.S., Lazowska, E.D.: Using processor-cache affinity information in shared-memory multiprocessor scheduling. IEEE Trans. Parallel Distrib. Syst. 4(2), 131\u2013143 (1993)","journal-title":"IEEE Trans. Parallel Distrib. Syst."},{"issue":"1","key":"6_CR3","doi-asserted-by":"publisher","first-page":"347","DOI":"10.1145\/1735970.1736059","volume":"38","author":"I Gelado","year":"2010","unstructured":"Gelado, I., Stone, J.E., Cabezas, J., et al.: An asymmetric distributed shared memory model for heterogeneous parallel systems. ACM SIGARCH Comput. Archit. News (ACM) 38(1), 347\u2013358 (2010)","journal-title":"ACM SIGARCH Comput. Archit. News (ACM)"},{"key":"6_CR4","unstructured":"George, V., Engineer, S.P., Piazza, T., et al.: Technology Insight: Intel Next Generation Microarchitecture Codename Ivy Bridge (2011)"},{"key":"6_CR5","unstructured":"Amd, APP SDK 2.4. http:\/\/developer.amd.com\/amd-license-agreement\/?f=AMD-APP-SDK-v2.4-Windows-64.exe"},{"key":"6_CR6","doi-asserted-by":"crossref","unstructured":"Jiang, Y., Shen, X., Chen, J., et al.: Analysis and approximation of optimal co-scheduling on chip multiprocessors. In: Proceedings of the 17th International Conference on Parallel Architectures and Compilation Techniques, pp. 220\u2013229. ACM (2008)","DOI":"10.1145\/1454115.1454146"},{"key":"6_CR7","doi-asserted-by":"crossref","unstructured":"Tian, K., Jiang, Y., Shen, X.: A study on optimally co-scheduling jobs of different lengths on chip multiprocessors. In: Proceedings of the 6th ACM Conference on Computing Frontiers, pp. 41\u201350. ACM (2009)","DOI":"10.1145\/1531743.1531752"},{"key":"6_CR8","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"201","DOI":"10.1007\/978-3-642-11515-8_16","volume-title":"High Performance Embedded Architectures and Compilers","author":"Y Jiang","year":"2010","unstructured":"Jiang, Y., Tian, K., Shen, X.: Combining locality analysis with online proactive job co-scheduling in chip multiprocessors. In: Patt, Y.N., Foglia, P., Duesterwald, E., Faraboschi, P., Martorell, X. (eds.) HiPEAC 2010. LNCS, vol. 5952, pp. 201\u2013215. Springer, Heidelberg (2010)"},{"key":"6_CR9","doi-asserted-by":"crossref","unstructured":"Fedorova, A., Seltzer, M., Smith, M.D.: Improving performance isolation on chip multiprocessors via an operating system scheduler. In: Proceedings of the 16th International Conference on Parallel Architecture and Compilation Techniques, pp. 25\u201338. IEEE Computer Society (2007)","DOI":"10.1109\/PACT.2007.4336197"},{"key":"6_CR10","doi-asserted-by":"crossref","unstructured":"El-Moursy, A., Garg, R., Albonesi, D.H., et al.: Compatible phase co-scheduling on a CMP of multi-threaded processors. In: Proceedings of the 20th International Parallel and Distributed Processing Symposium (IPDPS 2006), p. 10. IEEE (2006)","DOI":"10.1109\/IPDPS.2006.1639376"},{"key":"6_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"87","DOI":"10.1007\/978-3-319-09967-5_5","volume-title":"Languages and Compilers for Parallel Computing - Testing","author":"D Grewe","year":"2014","unstructured":"Grewe, D., Wang, Z., O\u2019Boyle, M.F.P.: OpenCL task partitioning in the presence of GPU contention. In: Ca\u1e63caval, C., Montesinos-Ortego, P. (eds.) LCPC 2013 - Testing. LNCS, vol. 8664, pp. 87\u2013101. Springer, Heidelberg (2014)"},{"key":"6_CR12","doi-asserted-by":"crossref","unstructured":"Luk, C.K., Hong, S., Qilin, K.H.: Exploiting parallelism on heterogeneous multiprocessors with adaptive mapping. In: 42nd Annual IEEE\/ACM International Symposium on Microarchitecture (MICRO-42), pp. 45\u201355. IEEE (2009)","DOI":"10.1145\/1669112.1669121"},{"key":"6_CR13","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"286","DOI":"10.1007\/978-3-642-19861-8_16","volume-title":"Compiler Construction","author":"D Grewe","year":"2011","unstructured":"Grewe, D., O\u2019Boyle, M.F.P.: A static task partitioning approach for heterogeneous systems using OpenCL. In: Knoop, J. (ed.) CC 2011. LNCS, vol. 6601, pp. 286\u2013305. Springer, Heidelberg (2011)"},{"key":"6_CR14","doi-asserted-by":"crossref","unstructured":"Ravi, V.T., Ma, W., Chiu, D., et al.: Compiler and runtime support for enabling generalized reduction computations on heterogeneous parallel configurations. In: Proceedings of the 24th ACM International Conference on Supercomputing, pp. 137\u2013146. ACM (2010)","DOI":"10.1145\/1810085.1810106"},{"key":"6_CR15","unstructured":"Mekkat, V., Holey, A., Yew, P.C., et al.: Managing shared last-level cache in a heterogeneous multicore processor. In: Proceedings of the 22nd International Conference on Parallel Architectures and Compilation Techniques, pp. 225\u2013234. IEEE Press (2013)"},{"key":"6_CR16","doi-asserted-by":"crossref","unstructured":"Liu, Y., Zhang, E.Z., Shen, X.: A cross-input adaptive framework for GPU program optimizations. In: IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2009), pp. 1\u201310. IEEE (2009)","DOI":"10.1109\/IPDPS.2009.5160988"},{"key":"6_CR17","doi-asserted-by":"crossref","unstructured":"Tuck, N., Tullsen, D.M.: Initial observations of the simultaneous multithreading Pentium 4 processor. In: Proceedings of the 12th International Conference on Parallel Architectures and Compilation Techniques (PACT 2003), pp. 26\u201334. IEEE (2003)","DOI":"10.1109\/PACT.2003.1237999"},{"issue":"5","key":"6_CR18","doi-asserted-by":"publisher","first-page":"245","DOI":"10.1145\/780822.781159","volume":"38","author":"C Ding","year":"2003","unstructured":"Ding, C., Zhong, Y.: Predicting whole-program locality through reuse distance analysis. ACM SIGPLAN Not. (ACM) 38(5), 245\u2013257 (2003)","journal-title":"ACM SIGPLAN Not. (ACM)"},{"issue":"4","key":"6_CR19","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1145\/2082156.2082183","volume":"39","author":"J Fousek","year":"2011","unstructured":"Fousek, J., Filipovi, J., Madzin, M.: Automatic fusions of CUDA-GPU kernels for parallel map. ACM SIGARCH Comput. Archit. News 39(4), 98\u201399 (2011)","journal-title":"ACM SIGARCH Comput. Archit. News"},{"key":"6_CR20","doi-asserted-by":"crossref","unstructured":"Wang, G., Lin, Y.S., Yi, W.: Kernel fusion: an effective method for better power efficiency on multithreaded GPU. In: 2010 IEEE\/ACM International Conference on Cyber, Physical and Social Computing (CPSCom), Green Computing and Communications (GreenCom), pp. 344\u2013350. IEEE (2010)","DOI":"10.1109\/GreenCom-CPSCom.2010.102"},{"key":"6_CR21","doi-asserted-by":"crossref","unstructured":"Wu, H., Diamos, G., Wang, J., et al.: Optimizing data warehousing applications for GPUs using kernel fusion, fission. In: 2012 IEEE 26th International Parallel and Distributed Processing Symposium Workshops & PhD Forum (IPDPSW), pp. 2433\u20132442. IEEE (2012)","DOI":"10.1109\/IPDPSW.2012.300"},{"key":"6_CR22","doi-asserted-by":"crossref","unstructured":"Aila, T., Laine, S.: Understanding the efficiency of ray traversal on GPUs. In: Proceedings of the Conference on High Performance Graphics, pp. 145\u2013149. ACM (2009)","DOI":"10.1145\/1572769.1572792"},{"key":"6_CR23","doi-asserted-by":"crossref","unstructured":"Chen, L., Villa, O., Krishnamoorthy, S., et al.: Dynamic load balancing on single-and multi-GPU systems. In: 2010 IEEE International Symposium on Parallel and Distributed Processing (IPDPS), pp. 1\u201312. IEEE (2010)","DOI":"10.1109\/IPDPS.2010.5470413"},{"key":"6_CR24","doi-asserted-by":"crossref","unstructured":"Gupta, K., Stuart, J.A., Owens, J.D.: A study of persistent threads style GPU programming for GPGPU workloads. In: Innovative Parallel Computing (InPar), pp. 1\u201314. IEEE (2012)","DOI":"10.1109\/InPar.2012.6339596"},{"key":"6_CR25","doi-asserted-by":"crossref","unstructured":"Xiao, S., Feng, W.: Inter-block GPU communication via fast barrier synchronization. In: 2010 IEEE International Symposium on Parallel and Distributed Processing (IPDPS), pp. 1\u201312. IEEE (2010)","DOI":"10.1109\/IPDPS.2010.5470477"},{"key":"6_CR26","unstructured":"http:\/\/unixhelp.ed.ac.uk\/CGI\/man-cgi?sched_setscheduler+2"},{"key":"6_CR27","doi-asserted-by":"crossref","unstructured":"Zahedi, S.M., Lee, B.C.: REF: resource elasticity fairness with sharing incentives for multiprocessors. In: Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS) (2014)","DOI":"10.1145\/2541940.2541962"},{"key":"6_CR28","doi-asserted-by":"crossref","unstructured":"Mars, J., Tang, L., Hundt, R.: Whare-Map: heterogeneity in homogeneous warehouse-scale computers. In: Proceedings of the 40th Annual International Symposium on Computer Architecture (ISCA), pp. 1\u201312 (2013)","DOI":"10.1145\/2485922.2485975"},{"issue":"5","key":"6_CR29","doi-asserted-by":"publisher","first-page":"203","DOI":"10.1145\/1837853.1693482","volume":"45","author":"EZ Zhang","year":"2010","unstructured":"Zhang, E.Z., Jiang, Y., Shen, X.: Does cache sharing on modern CMP matter to the performance of contemporary multithreaded programs? ACM Sigplan Not. (ACM) 45(5), 203\u2013212 (2010)","journal-title":"ACM Sigplan Not. (ACM)"},{"key":"6_CR30","doi-asserted-by":"crossref","unstructured":"Chang, J., Sohi, G.S.: Cooperative cache partitioning for chip multiprocessors. In: Proceedings of the 21st Annual International Conference on Supercomputing, pp. 242\u2013252. ACM (2007)","DOI":"10.1145\/1274971.1275005"},{"key":"6_CR31","doi-asserted-by":"crossref","unstructured":"Rafique, N., Lim, W.T., Thottethodi, M.: Architectural support for operating system-driven CMP cache management. In: Proceedings of the 15th International Conference on Parallel Architectures and Compilation Techniques, pp. 2\u201312. ACM (2006)","DOI":"10.1145\/1152154.1152160"},{"key":"6_CR32","doi-asserted-by":"crossref","unstructured":"Suh, G.E., Devadas, S., Rudolph, L.: A new memory monitoring scheme for memory-aware scheduling and partitioning. In: Proceedings of the Eighth International Symposium on High-Performance Computer Architecture, pp. 117\u2013128. IEEE (2002)","DOI":"10.1109\/HPCA.2002.995703"},{"key":"6_CR33","doi-asserted-by":"crossref","unstructured":"Qureshi, M.K., Patt, Y.N.: Utility-based cache partitioning: a low-overhead, high-performance, runtime mechanism to partition shared caches. In: Proceedings of the 39th Annual IEEE\/ACM International Symposium on Microarchitecture, pp. 423\u2013432. IEEE Computer Society (2006)","DOI":"10.1109\/MICRO.2006.49"}],"container-title":["Lecture Notes in Computer Science","Languages and Compilers for Parallel Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-17473-0_6","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,27]],"date-time":"2025-05-27T18:35:33Z","timestamp":1748370933000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-17473-0_6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015]]},"ISBN":["9783319174723","9783319174730"],"references-count":33,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-17473-0_6","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2015]]},"assertion":[{"value":"1 May 2015","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}