{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,31]],"date-time":"2026-01-31T06:24:07Z","timestamp":1769840647788,"version":"3.49.0"},"publisher-location":"Cham","reference-count":27,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319729701","type":"print"},{"value":"9783319729718","type":"electronic"}],"license":[{"start":{"date-parts":[[2017,12,23]],"date-time":"2017-12-23T00:00:00Z","timestamp":1513987200000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-319-72971-8_1","type":"book-chapter","created":{"date-parts":[[2017,12,22]],"date-time":"2017-12-22T03:44:54Z","timestamp":1513914294000},"page":"3-21","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":13,"title":["Evaluating On-Node GPU Interconnects for\u00a0Deep Learning Workloads"],"prefix":"10.1007","author":[{"given":"Nathan R.","family":"Tallent","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nitin A.","family":"Gawande","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Charles","family":"Siegel","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Abhinav","family":"Vishnu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Adolfy","family":"Hoisie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,12,23]]},"reference":[{"key":"1_CR1","doi-asserted-by":"crossref","unstructured":"Gawande, N.A., Landwehr, J.B., Daily, J.A., Tallent, N.R., Vishnu, A., Kerbyson, D.J.: Scaling deep learning workloads: NVIDIA DGX-1\/Pascal and Intel Knights Landing. In: 2017 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), pp. 399\u2013408, May 2017","DOI":"10.1109\/IPDPSW.2017.36"},{"issue":"2","key":"1_CR2","doi-asserted-by":"crossref","first-page":"7","DOI":"10.1109\/MM.2017.37","volume":"37","author":"D Foley","year":"2017","unstructured":"Foley, D., Danskin, J.: Ultra-performance Pascal GPU and NVLink interconnect. IEEE Micro 37(2), 7\u201317 (2017)","journal-title":"IEEE Micro"},{"issue":"2","key":"1_CR3","doi-asserted-by":"crossref","first-page":"34","DOI":"10.1109\/MM.2016.25","volume":"36","author":"A Sodani","year":"2016","unstructured":"Sodani, A., Gramunt, R., Corbal, J., Kim, H.S., Vinod, K., Chinthamani, S., Hutsell, S., Agarwal, R., Liu, Y.C.: Knights landing: second-generation Intel Xeon Phi product. IEEE Micro 36(2), 34\u201346 (2016)","journal-title":"IEEE Micro"},{"key":"1_CR4","unstructured":"Cirrascale: The GX8 series multi-device peering platform, June 2016. http:\/\/www.cirrascale.com\/documents\/datasheets\/Cirrascale_GX8Series_Datasheet_CM080C.pdf"},{"key":"1_CR5","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: Pereira, F., Burges, C., Bottou, L., Weinberger, K. (eds.) Advances in Neural Information Processing Systems, vol. 25, pp. 1097\u20131105. Curran Associates, Inc., Red Hook (2012)"},{"key":"1_CR6","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A.: Going deeper with convolutions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20139 (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"1_CR7","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"1_CR8","unstructured":"Cirrascale: Cirrascale SR3615 PCIe switch riser, July 2015"},{"key":"1_CR9","unstructured":"Cirrascale: Scaling GPU compute performance (2015). http:\/\/www.cirrascale.com\/documents\/whitepapers\/Cirrascale_ScalingGPUCompute_WP_M987_REVA.pdf"},{"key":"1_CR10","unstructured":"Luehr, N.: Fast multi-GPU collectives with NCCL, April 2016. https:\/\/devblogs.nvidia.com\/parallelforall\/fast-multi-gpu-collectives-nccl\/"},{"key":"1_CR11","unstructured":"NVIDIA: NCCL: NVIDIA collective communications library, August 2017. https:\/\/developer.nvidia.com\/nccl"},{"key":"1_CR12","doi-asserted-by":"crossref","unstructured":"Awan, A.A., Hamidouche, K., Venkatesh, A., Panda, D.K.: Efficient large message broadcast using NCCL and CUDA-aware MPI for deep learning. In: Proceedings of the 23rd European MPI Users\u2019 Group Meeting, EuroMPI 2016, New York, NY, USA, pp. 15\u201322. ACM (2016)","DOI":"10.1145\/2966884.2966912"},{"key":"1_CR13","unstructured":"Jeaugey, S.: Optimized inter-GPU collective operations with NCCL, May 2017. http:\/\/on-demand-gtc.gputechconf.com\/gtc-quicklink\/8Bdyh"},{"issue":"2","key":"1_CR14","doi-asserted-by":"crossref","first-page":"117","DOI":"10.1016\/j.jpdc.2008.09.002","volume":"69","author":"P Patarasuk","year":"2009","unstructured":"Patarasuk, P., Yuan, X.: Bandwidth optimal all-reduce algorithms for clusters of workstations. J. Parallel Distrib. Comput. 69(2), 117\u2013124 (2009)","journal-title":"J. Parallel Distrib. Comput."},{"issue":"3","key":"1_CR15","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, L.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"1_CR16","unstructured":"Berkeley Vision and Learning Center. Berkeley vision and learning center: Caffe (2016). http:\/\/caffe.berkeleyvision.org"},{"key":"1_CR17","doi-asserted-by":"crossref","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Long, J., Girshick, R., Guadarrama, S., Darrell, T.: Caffe: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093 (2014)","DOI":"10.1145\/2647868.2654889"},{"key":"1_CR18","unstructured":"Berkeley Vision and Learning Center: Convolutional architecture for fast feature embedding (Caffe) (2016). https:\/\/github.com\/BVLC\/caffe\/"},{"key":"1_CR19","unstructured":"NVIDIA: Convolutional architecture for fast feature embedding (Caffe) (2017). https:\/\/github.com\/NVIDIA\/caffe"},{"key":"1_CR20","unstructured":"Ben-Nun, T.: MGBench: multi-GPU computing benchmark suite, February 2016. https:\/\/github.com\/tbennun\/mgbench"},{"key":"1_CR21","unstructured":"Cirrascale: Cirrascale SR3514: Unexpected performance inequality. Technical Brief M901A\u2013092014"},{"key":"1_CR22","doi-asserted-by":"crossref","unstructured":"Adolf, R., Rama, S., Reagen, B., Wei, G.Y., Brooks, D.: Fathom: reference workloads for modern deep learning methods. In: 2016 IEEE International Symposium on Workload Characterization (IISWC), pp. 1\u201310, September 2016","DOI":"10.1109\/IISWC.2016.7581275"},{"key":"1_CR23","doi-asserted-by":"crossref","unstructured":"Christensen, C., Fogal, T., Luehr, N., Woolley, C.: Topology-aware image compositing using NVLink. In: 2016 IEEE 6th Symposium on Large Data Analysis and Visualization (LDAV), pp. 93\u201394, October 2016","DOI":"10.1109\/LDAV.2016.7874334"},{"key":"1_CR24","doi-asserted-by":"crossref","unstructured":"Shams, S., Platania, R., Lee, K., Park, S.J.: Evaluation of deep learning frameworks over different HPC architectures. In: Proceedings of the IEEE 37th International Conference on Distributed Computing Systems, pp. 1389\u20131396, June 2017","DOI":"10.1109\/ICDCS.2017.259"},{"issue":"4","key":"1_CR25","doi-asserted-by":"crossref","first-page":"9","DOI":"10.1145\/2693714.2693717","volume":"42","author":"S Nomura","year":"2014","unstructured":"Nomura, S., Mitsuishi, T., Suzuki, J., Hayashi, Y., Kan, M., Amano, H.: Performance analysis of the multi-GPU system with expether. SIGARCH Comput. Archit. News 42(4), 9\u201314 (2014)","journal-title":"SIGARCH Comput. Archit. News"},{"key":"1_CR26","doi-asserted-by":"crossref","unstructured":"Ben-Nun, T., Sutton, M., Pai, S., Pingali, K.: Groute: an asynchronous multi-GPU programming model for irregular computations. In: Proceedings of the 22nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, New\u00a0York, NY, USA, pp. 235\u2013248. ACM (2017)","DOI":"10.1145\/3018743.3018756"},{"key":"1_CR27","doi-asserted-by":"crossref","unstructured":"Awan, A.A., Chu, C.H., Subramoni, H., Panda, D.K.: Optimized broadcast for deep learning workloads on dense-GPU infiniband clusters: MPI or NCCL? arXiv preprint arXiv:1707.09414 (2017)","DOI":"10.1145\/3236367.3236381"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing Systems. Performance Modeling, Benchmarking, and Simulation"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-72971-8_1","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,10,8]],"date-time":"2019-10-08T09:29:31Z","timestamp":1570526971000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-72971-8_1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,12,23]]},"ISBN":["9783319729701","9783319729718"],"references-count":27,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-72971-8_1","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,12,23]]}}}