{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2022,4,1]],"date-time":"2022-04-01T08:35:42Z","timestamp":1648802142681},"reference-count":30,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2017,6,15]],"date-time":"2017-06-15T00:00:00Z","timestamp":1497484800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"National Natural Science Foundation of China (CN)","award":["61379035"],"award-info":[{"award-number":["61379035"]}]},{"name":"National Natural Science Foundation of Zhejiang Province, China","award":["LY14F020005","LQ14F02001"],"award-info":[{"award-number":["LY14F020005","LQ14F02001"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["J Supercomput"],"published-print":{"date-parts":[[2017,12]]},"DOI":"10.1007\/s11227-017-2093-8","type":"journal-article","created":{"date-parts":[[2017,6,15]],"date-time":"2017-06-15T11:15:40Z","timestamp":1497525340000},"page":"5414-5439","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Enable back memory and global synchronization on LLC buffer"],"prefix":"10.1007","volume":"73","author":[{"given":"Licheng","family":"Yu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yulong","family":"Pei","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianzhou","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xueqing","family":"Lou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Minghui","family":"Wu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tiefei","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,6,15]]},"reference":[{"key":"2093_CR1","doi-asserted-by":"publisher","unstructured":"Agarwal N, Nellans D, Ebrahimi E, Wenisch TF, Danskin J, Keckler SW (2016) Selective gpu caches to eliminate cpu-gpu hw cache coherence. In: 2016 IEEE International Symposium on High Performance Computer Architecture (HPCA), pp 494\u2013506, doi:\n                        10.1109\/HPCA.2016.7446089","DOI":"10.1109\/HPCA.2016.7446089"},{"key":"2093_CR2","doi-asserted-by":"publisher","unstructured":"Al-Saber N, Kulkarni M (2015) Semcache++: Semantics-aware caching for efficient multi-gpu offloading. In: Proceedings of the 29th ACM on International Conference on Supercomputing, ACM, New York, ICS \u201915, pp 79\u201388, doi:\n                        10.1145\/2751205.2751210","DOI":"10.1145\/2751205.2751210"},{"key":"2093_CR3","doi-asserted-by":"publisher","unstructured":"Amini M, Coelho F, Irigoin F, Keryell R (2013) Static Compilation Analysis for Host-Accelerator Communication Optimization, Springer Berlin Heidelberg, Heidelberg, pp 237\u2013251. doi:\n                        10.1007\/978-3-642-36036-7_16","DOI":"10.1007\/978-3-642-36036-7_16"},{"key":"2093_CR4","doi-asserted-by":"publisher","unstructured":"Asmussen N, V\u00f6lp M, N\u00f6then B, H\u00e4rtig H, Fettweis G (2016) M3: A hardware\/operating-system co-design to tame heterogeneous manycores. In: Proceedings of the Twenty-First International Conference on Architectural Support for Programming Languages and Operating Systems, ACM, New York, ASPLOS \u201916, pp 189\u2013203, doi:\n                        10.1145\/2872362.2872371","DOI":"10.1145\/2872362.2872371"},{"key":"2093_CR5","doi-asserted-by":"publisher","unstructured":"Bakhoda A, Yuan GL, Fung WWL, Wong H, Aamodt TM (2009) Analyzing cuda workloads using a detailed gpu simulator. In: Performance Analysis of Systems and Software, 2009. ISPASS 2009. IEEE International Symposium on, pp 163\u2013174, doi:\n                        10.1109\/ISPASS.2009.4919648","DOI":"10.1109\/ISPASS.2009.4919648"},{"issue":"2","key":"2093_CR6","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/2024716.2024718","volume":"39","author":"N Binkert","year":"2011","unstructured":"Binkert N, Beckmann B, Black G, Reinhardt SK, Saidi A, Basu A, Hestness J, Hower DR, Krishna T, Sardashti S, Sen R, Sewell K, Shoaib M, Vaish N, Hill MD, Wood DA (2011) The gem5 simulator. SIGARCH Comput Archit News 39(2):1\u20137. doi:\n                        10.1145\/2024716.2024718","journal-title":"SIGARCH Comput Archit News"},{"key":"2093_CR7","doi-asserted-by":"publisher","unstructured":"Dubach C, Cheng P, Rabbah R, Bacon DF, Fink SJ (2012) Compiling a high-level language for gpus: (via language support for architectures and compilers). In: Proceedings of the 33rd ACM SIGPLAN Conference on Programming Language Design and Implementation, ACM, New York, PLDI \u201912, pp 1\u201312, doi:\n                        10.1145\/2254064.2254066","DOI":"10.1145\/2254064.2254066"},{"key":"2093_CR8","unstructured":"Group KOW et al. (2008) The opencl specification. 1(29):8"},{"key":"2093_CR9","doi-asserted-by":"publisher","unstructured":"Ham TJ, Arag\u00f3n JL, Martonosi M (2015) Desc: Decoupled supply-compute communication management for heterogeneous architectures. In: Proceedings of the 48th International Symposium on Microarchitecture, ACM, New York, MICRO-48, pp 191\u2013203, doi:\n                        10.1145\/2830772.2830800","DOI":"10.1145\/2830772.2830800"},{"key":"2093_CR10","unstructured":"Harish P, Narayanan PJ (2007) High Performance Computing \u2013 HiPC 2007: 14th International Conference, Goa, India, December 18-21, 2007. Proceedings, Springer Berlin Heidelberg, Heidelberg, chap Accelerating Large Graph Algorithms on the GPU Using CUDA, pp 197\u2013208"},{"key":"2093_CR11","doi-asserted-by":"publisher","unstructured":"Hayashi A, Ishizaki K, Koblents G, Sarkar V (2015) Machine-learning-based performance heuristics for runtime cpu\/gpu selection. In: Proceedings of the Principles and Practices of Programming on The Java Platform, ACM, New York, PPPJ \u201915, pp 27\u201336, doi:\n                        10.1145\/2807426.2807429","DOI":"10.1145\/2807426.2807429"},{"key":"2093_CR12","doi-asserted-by":"publisher","unstructured":"Ishizaki K, Hayashi A, Koblents G, Sarkar V (2015) Compiling and optimizing java 8 programs for gpu execution. In: 2015 International Conference on Parallel Architecture and Compilation (PACT), pp 419\u2013431, doi:\n                        10.1109\/PACT.2015.46","DOI":"10.1109\/PACT.2015.46"},{"key":"2093_CR13","doi-asserted-by":"publisher","unstructured":"Jablin TB, Prabhu P, Jablin JA, Johnson NP, Beard SR, August DI (2011) Automatic cpu-gpu communication management and optimization. In: Proceedings of the 32Nd ACM SIGPLAN Conference on Programming Language Design and Implementation, ACM, New York, PLDI \u201911, pp 142\u2013151, doi:\n                        10.1145\/1993498.1993516","DOI":"10.1145\/1993498.1993516"},{"key":"2093_CR14","doi-asserted-by":"publisher","unstructured":"Jablin TB, Jablin JA, Prabhu P, Liu F, August DI (2012) Dynamically managed data for cpu-gpu architectures. In: Proceedings of the Tenth International Symposium on Code Generation and Optimization, ACM, New York, CGO \u201912, pp 165\u2013174, doi:\n                        10.1145\/2259016.2259038","DOI":"10.1145\/2259016.2259038"},{"key":"2093_CR15","unstructured":"Kato S, McThrow M, Maltzahn C, Brandt S (2012) Gdev: First-class gpu resource management in the operating system. Presented as part of the 2012 USENIX Annual Technical Conference (USENIX ATC 12). USENIX, Boston, pp 401\u2013412"},{"key":"2093_CR16","doi-asserted-by":"publisher","unstructured":"Kato S, Aumiller J, Brandt S (2013) Zero-copy i\/o processing for low-latency gpu computing. In: Proceedings of the ACM\/IEEE 4th International Conference on Cyber-Physical Systems, ACM, New York, ICCPS \u201913, pp 170\u2013178, doi:\n                        10.1145\/2502524.2502548\n                        \n                    .","DOI":"10.1145\/2502524.2502548"},{"key":"2093_CR17","doi-asserted-by":"publisher","unstructured":"Lee H, Brown KJ, Sujeeth AK, Rompf T, Olukotun K (2014) Locality-aware mapping of nested parallel patterns on gpus. In: Proceedings of the 47th Annual IEEE\/ACM International Symposium on Microarchitecture, IEEE Computer Society, Washington, MICRO-47, pp 63\u201374, doi:\n                        10.1109\/MICRO.2014.23\n                        \n                    .","DOI":"10.1109\/MICRO.2014.23"},{"key":"2093_CR18","doi-asserted-by":"crossref","unstructured":"Licheng Y, Yulong P, Tianzhou C, Xueqing L, Minghui W, Tiefei Z (2016) LLC buffer for arbitrary data sharing in heterogeneous systems. In: High Performance Computing and Communications; IEEE 14th International Conference on Smart City; IEEE 2nd International Conference on Data Science and Systems (HPCC\/SmartCity\/DSS), 2016 IEEE 18th International Conference on, IEEE, pp 260\u2013267","DOI":"10.1109\/HPCC-SmartCity-DSS.2016.0046"},{"key":"2093_CR19","doi-asserted-by":"publisher","unstructured":"Luo L, Wong M, Hwu Wm (2010) An effective gpu implementation of breadth-first search. In: Proceedings of the 47th Design Automation Conference, ACM, New York, DAC \u201910, pp 52\u201355, doi:\n                        10.1145\/1837274.1837289","DOI":"10.1145\/1837274.1837289"},{"key":"2093_CR20","doi-asserted-by":"publisher","unstructured":"Margiolas C, O\u2019Boyle MFP (2014) Portable and transparent host-device communication optimization for gpgpu environments. In: Proceedings of Annual IEEE\/ACM International Symposium on Code Generation and Optimization, ACM, New York, CGO \u201914, pp 55:55\u201355:65, doi:\n                        10.1145\/2544137.2544156","DOI":"10.1145\/2544137.2544156"},{"key":"2093_CR21","unstructured":"Nvidia C (2008) Cuda programming guide"},{"key":"2093_CR22","doi-asserted-by":"publisher","unstructured":"Pai S, Govindarajan R, Thazhuthaveetil MJ (2012) Fast and efficient automatic memory management for gpus using compiler-assisted runtime coherence scheme. In: Proceedings of the 21st International Conference on Parallel Architectures and Compilation Techniques, ACM, New York, PACT \u201912, pp 33\u201342, doi:\n                        10.1145\/2370816.2370824","DOI":"10.1145\/2370816.2370824"},{"key":"2093_CR23","doi-asserted-by":"publisher","unstructured":"Phothilimthana PM, Ansel J, Ragan-Kelley J, Amarasinghe S (2013) Portable performance on heterogeneous architectures. In: Proceedings of the Eighteenth International Conference on Architectural Support for Programming Languages and Operating Systems, ACM, New York, ASPLOS \u201913, pp 431\u2013444, doi:\n                        10.1145\/2451116.2451162\n                        \n                    .","DOI":"10.1145\/2451116.2451162"},{"key":"2093_CR24","doi-asserted-by":"publisher","unstructured":"Ren B, Ravi N, Yang Y, Feng M, Agrawal G, Chakradhar S (2016) Automatic and Efficient Data Host-Device Communication for Many-Core Coprocessors, Springer International Publishing, Cham, pp 173\u2013190. doi:\n                        10.1007\/978-3-319-29778-1_11","DOI":"10.1007\/978-3-319-29778-1_11"},{"key":"2093_CR25","unstructured":"Richards M (1997) Backtracking algorithms in MCPL using bit patterns and recursion. Citeseer"},{"key":"2093_CR26","unstructured":"Stratton JA, Rodrigues C, Sung I, Obeid N, Chang L, Anssari N, Liu G, Hwu W (2012) The parboil technical report. Tech. rep., IMPACT Technical Report (IMPACT-12-01), University of Illinois Urbana-Champaign"},{"key":"2093_CR27","unstructured":"Thoziyoor S, Muralimanohar N, Ahn JH, Jouppi NP (2008) Cacti 5.1. Tech. rep., Technical Report HPL-2008-20, HP Labs"},{"key":"2093_CR28","doi-asserted-by":"publisher","unstructured":"Wang Z, Grewe D, O\u2019boyle MFP (2014) Automatic and portable mapping of data parallel programs to opencl for gpu-based heterogeneous systems. ACM Trans Archit Code Optim 11(4):42:1\u201342:26, doi:\n                        10.1145\/2677036","DOI":"10.1145\/2677036"},{"key":"2093_CR29","unstructured":"Wolf C, Glaser J, Kepler J (2013) Yosys-a free verilog synthesis suite. In: Proceedings of the 21st Austrian Workshop on Microelectronics (Austrochip)"},{"key":"2093_CR30","doi-asserted-by":"publisher","unstructured":"Xiao S, c\u00a0Feng W (2010) Inter-block gpu communication via fast barrier synchronization. In: Parallel Distributed Processing (IPDPS), 2010 IEEE International Symposium on, pp 1\u201312, doi:\n                        10.1109\/IPDPS.2010.5470477","DOI":"10.1109\/IPDPS.2010.5470477"}],"container-title":["The Journal of Supercomputing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11227-017-2093-8\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-017-2093-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11227-017-2093-8.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,10,27]],"date-time":"2017-10-27T14:09:25Z","timestamp":1509113365000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11227-017-2093-8"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,6,15]]},"references-count":30,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2017,12]]}},"alternative-id":["2093"],"URL":"https:\/\/doi.org\/10.1007\/s11227-017-2093-8","relation":{},"ISSN":["0920-8542","1573-0484"],"issn-type":[{"value":"0920-8542","type":"print"},{"value":"1573-0484","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,6,15]]}}}