{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,9]],"date-time":"2024-09-09T17:48:18Z","timestamp":1725904098328},"publisher-location":"Cham","reference-count":11,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319619811"},{"type":"electronic","value":"9783319619828"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-61982-8_14","type":"book-chapter","created":{"date-parts":[[2017,7,13]],"date-time":"2017-07-13T05:22:20Z","timestamp":1499923340000},"page":"135-145","source":"Crossref","is-referenced-by-count":1,"title":["Implementation and Evaluation of NAS Parallel CG Benchmark on GPU Cluster with Proprietary Interconnect TCA"],"prefix":"10.1007","author":[{"given":"Kazuya","family":"Matsumoto","sequence":"first","affiliation":[]},{"given":"Norihisa","family":"Fujita","sequence":"additional","affiliation":[]},{"given":"Toshihiro","family":"Hanawa","sequence":"additional","affiliation":[]},{"given":"Taisuke","family":"Boku","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,7,14]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Bailey, D.H., Schreiber, R.S., Simon, H.D., Venkatakrishnan, V., Weeratunga, S.K., Barszcz, E., Barton, J.T., Browning, D.S., Carter, R.L., Dagum, L., Fatoohi, R.A., Frederickson, P.O., Lasinski, T.A.: The NAS parallel benchmarks - summary and preliminary results. In: Proceedings of SC 1991, pp. 158\u2013165 (1991)","DOI":"10.1145\/125826.125925"},{"key":"14_CR2","doi-asserted-by":"crossref","unstructured":"Grewe, D., Wang, Z., O\u2019Boyle, M.F.P.: Portable mapping of data parallel programs to OpenCL for heterogeneous systems. In: Proceedings of CGO 2013, pp. 1\u201310. IEEE (2013)","DOI":"10.1109\/CGO.2013.6494993"},{"key":"14_CR3","doi-asserted-by":"crossref","unstructured":"Hanawa, T., Fujii, H., Fujita, N., Odajima, T., Matsumoto, K., Boku, T.: Evaluation of FFT for GPU cluster using tightly coupled accelerators architecture. In: Proceedings of Cluster 2015, pp. 635\u2013641. IEEE (2015)","DOI":"10.1109\/CLUSTER.2015.113"},{"key":"14_CR4","doi-asserted-by":"crossref","unstructured":"Hanawa, T., Kodama, Y., Boku, T., Sato, M.: Tightly coupled accelerators architecture for minimizing communication latency among accelerators. In: Proceedings of IPDPSW 2013, pp. 1030\u20131039. IEEE (2013)","DOI":"10.1109\/IPDPSW.2013.226"},{"issue":"4","key":"14_CR5","doi-asserted-by":"crossref","first-page":"3","DOI":"10.1145\/2693714.2693716","volume":"42","author":"Y Kodama","year":"2014","unstructured":"Kodama, Y., Hanawa, T., Boku, T., Sato, M.: PEACH2: an FPGA-based PCIe network device for tightly coupled accelerators. ACM SIGARCH Comput. Architect. News 42(4), 3\u20138 (2014)","journal-title":"ACM SIGARCH Comput. Architect. News"},{"key":"14_CR6","doi-asserted-by":"crossref","unstructured":"Lee, S., Vetter, J.S.: Early evaluation of directive-based GPU programming models for productive exascale computing. In: Proceedings of SC 2012 (2012)","DOI":"10.1109\/SC.2012.51"},{"key":"14_CR7","doi-asserted-by":"crossref","unstructured":"Matsumoto, K., Hanawa, T., Kodama, Y., Fujii, H., Boku, T.: Implementation of CG method on GPU cluster with proprietary interconnect TCA for GPU direct communication. In: Proceedings of IPDPSW 2015, pp. 647\u2013655. IEEE (2015)","DOI":"10.1109\/IPDPSW.2015.102"},{"key":"14_CR8","unstructured":"NVIDIA: NVIDIA GPUDirect. \nhttps:\/\/developer.nvidia.com\/gpudirect\n\n. Accessed 25 Aug 2016"},{"key":"14_CR9","unstructured":"Panda, D.K.: MVAPICH2-GDR (MVAPICH2 with GPUDirect RDMA). \nhttp:\/\/mvapich.cse.ohio-state.edu\/overview\/\n\n. Accessed 25 Aug 2016"},{"issue":"4","key":"14_CR10","doi-asserted-by":"crossref","first-page":"23","DOI":"10.1145\/1964218.1964223","volume":"38","author":"SJ Pennycook","year":"2011","unstructured":"Pennycook, S.J., Hammond, S.D., Jarvis, S.A., Mudalige, G.R.: Performance analysis of a hybrid MPI\/CUDA implementation of the NAS-LU benchmark. SIGMETRICS Perform. Eval. Rev. 38(4), 23\u201329 (2011)","journal-title":"SIGMETRICS Perform. Eval. Rev."},{"key":"14_CR11","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1007\/978-3-319-17473-0_5","volume-title":"Languages and Compilers for Parallel Computing","author":"R Xu","year":"2015","unstructured":"Xu, R., Tian, X., Chandrasekaran, S., Yan, Y., Chapman, B.: NAS parallel benchmarks for GPGPUs using a directive-based programming model. In: Brodman, J., Tu, P. (eds.) LCPC 2014. LNCS, vol. 8967, pp. 67\u201381. Springer, Cham (2015). doi:\n10.1007\/978-3-319-17473-0_5"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing for Computational Science \u2013 VECPAR 2016"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-61982-8_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2017,7,13]],"date-time":"2017-07-13T05:26:05Z","timestamp":1499923565000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-61982-8_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319619811","9783319619828"],"references-count":11,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-61982-8_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2017]]}}}