{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,4]],"date-time":"2025-12-04T10:01:34Z","timestamp":1764842494448,"version":"3.40.3"},"publisher-location":"Cham","reference-count":39,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030801250"},{"type":"electronic","value":"9783030801267"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-80126-7_35","type":"book-chapter","created":{"date-parts":[[2021,7,6]],"date-time":"2021-07-06T11:11:23Z","timestamp":1625569883000},"page":"473-491","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":15,"title":["Hierarchical Roofline Performance Analysis for Deep Learning Applications"],"prefix":"10.1007","author":[{"given":"Charlene","family":"Yang","sequence":"first","affiliation":[]},{"given":"Yunsong","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Thorsten","family":"Kurth","sequence":"additional","affiliation":[]},{"given":"Steven","family":"Farrell","sequence":"additional","affiliation":[]},{"given":"Samuel","family":"Williams","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,7,7]]},"reference":[{"key":"35_CR1","unstructured":"apex.amp. Accessed 15 Oct 2020"},{"key":"35_CR2","unstructured":"CUDA C++ wmma API"},{"key":"35_CR3","unstructured":"CUDA cuBLAS Library"},{"key":"35_CR4","unstructured":"Deep Learning Climate Segmentation Benchmark"},{"key":"35_CR5","unstructured":"Deterministic Profiling for TensorFlow"},{"key":"35_CR6","unstructured":"Empirical Roofline Toolkit (ERT). Accessed 15 Oct 2020"},{"key":"35_CR7","unstructured":"MLPerf Benchmark"},{"key":"35_CR8","unstructured":"NERSC Roofline Model Documentation"},{"key":"35_CR9","unstructured":"Nsight compute cli - metric comparison. Accessed 15 Oct 2020"},{"key":"35_CR10","unstructured":"NVIDIA CUPTI API reference guide"},{"key":"35_CR11","unstructured":"Nvidia developer tools overview. Accessed 15 Oct 2020"},{"key":"35_CR12","unstructured":"PerfWorks measurement library for Nsight Compute"},{"key":"35_CR13","unstructured":"Profiler user\u2019s guide. Accessed 15 Oct 2020"},{"key":"35_CR14","unstructured":"Roofline Methodology on NVIDIA GPUs"},{"key":"35_CR15","doi-asserted-by":"crossref","unstructured":"Chen, L.-C., Zhu, Y., Papandreou, G., Schroff, F., Adam, H.: Encoder-decoder with atrous separable convolution for semantic image segmentation. In: Proceedings of the European Conference on Computer Vision (ECCV), September 2018","DOI":"10.1007\/978-3-030-01234-2_49"},{"key":"35_CR16","unstructured":"Chetlur, S., et al.: cuDNN: efficient primitives for deep learning. arXiv preprint arXiv:1410.0759 (2014)"},{"key":"35_CR17","doi-asserted-by":"crossref","unstructured":"Choi, J.W., Bedard, D., Fowler, R., Vuduc, R.: A roofline model of energy. In: 2013 IEEE 27th International Symposium on Parallel and Distributed Processing, pp. 661\u2013672 (2013)","DOI":"10.1109\/IPDPS.2013.77"},{"key":"35_CR18","doi-asserted-by":"crossref","unstructured":"Ben, M.D., Yang, C., Louie, S., Deslippe, J.: Accelerating large-scale GW calculations on hybrid GPU-CPU systems. Bull. Am. Phys. Soc. 65 (2020)","DOI":"10.1109\/SC41405.2020.00008"},{"key":"35_CR19","doi-asserted-by":"crossref","unstructured":"Ding, N., Williams, S.: An instruction roofline model for GPUs. In: 2019 IEEE\/ACM Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS), pp. 7\u201318. IEEE (2019)","DOI":"10.1109\/PMBS49563.2019.00007"},{"key":"35_CR20","doi-asserted-by":"crossref","unstructured":"Doerfler, D., et al.: Applying the roofline performance model to the Intel Xeon Phi knights landing processor. In: International Conference on High Performance Computing, pp. 339\u2013353. Springer (2016)","DOI":"10.1007\/978-3-319-46079-6_24"},{"key":"35_CR21","doi-asserted-by":"crossref","unstructured":"Gayatri, R., Yang, C., Kurth, T., Deslippe, J.: A case study for performance portability using OpenMP 4.5. In: International Workshop on Accelerator Programming Using Directives, pp. 75\u201395. Springer (2018)","DOI":"10.1007\/978-3-030-12274-4_4"},{"key":"35_CR22","unstructured":"Goodfellow, I., et al.: Generative adversarial nets. In: Advances in neural information processing systems, pp. 2672\u20132680 (2014)"},{"key":"35_CR23","doi-asserted-by":"crossref","unstructured":"Ibrahim, K.Z., Williams, S., Oliker, L.: Performance analysis FF GPU programming models using the roofline scaling trajectories. In: International Symposium on Benchmarking, Measuring and Optimization, pp. 3\u201319. Springer (2019)","DOI":"10.1007\/978-3-030-49556-5_1"},{"issue":"3","key":"35_CR24","doi-asserted-by":"publisher","first-page":"224","DOI":"10.1007\/s42514-019-00018-4","volume":"1","author":"MH Javed","year":"2019","unstructured":"Javed, M.H., Ibrahim, K.Z., Lu, X.: Performance analysis of deep learning workloads using roofline trajectories. CCF Trans. High Perform. Comput. 1(3), 224\u2013239 (2019)","journal-title":"CCF Trans. High Perform. Comput."},{"key":"35_CR25","doi-asserted-by":"crossref","unstructured":"Joubert, W., et al.: Attacking the opioid epidemic: determining the epistatic and pleiotropic genetic architectures for chronic pain and opioid addiction. In: SC18: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 717\u2013730. IEEE (2018)","DOI":"10.1109\/SC.2018.00060"},{"key":"35_CR26","doi-asserted-by":"crossref","unstructured":"Koskela, T., et al.: A novel multi-level integrated roofline model approach for performance characterization. In: International Conference on High Performance Computing, pp. 226\u2013245. Springer (2018)","DOI":"10.1007\/978-3-319-92040-5_12"},{"key":"35_CR27","doi-asserted-by":"crossref","unstructured":"Kurth, T., et al.: Exascale deep learning for climate analytics. In: SC18: International Conference for High Performance Computing, Networking, Storage and Analysis, pp. 649\u2013660. IEEE (2018)","DOI":"10.1109\/SC.2018.00054"},{"issue":"10","key":"35_CR28","first-page":"1995","volume":"3361","author":"Y LeCun","year":"1995","unstructured":"LeCun, Y., Bengio, Y., et al.: Convolutional networks for images, speech, and time series. Handb. Brain Theory Neural Netw. 3361(10), 1995 (1995)","journal-title":"Handb. Brain Theory Neural Netw."},{"key":"35_CR29","doi-asserted-by":"crossref","unstructured":"Lopes, A., Pratas, F., Sousa, L., Ilic, A.: Exploring GPU performance, power and energy-efficiency bounds with cache-aware roofline modeling. In: 2017 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pp. 259\u2013268 (2017)","DOI":"10.1109\/ISPASS.2017.7975297"},{"key":"35_CR30","doi-asserted-by":"crossref","unstructured":"Madsen, J.R., et al.: Timemory: modular performance analysis for HPC. In: International Conference on High Performance Computing, pp. 434\u2013452. Springer (2020)","DOI":"10.1007\/978-3-030-50743-5_22"},{"key":"35_CR31","unstructured":"Tesla NVIDIA. V100 GPU architecture. The world\u2019s most advanced data center GPU. version WP-08608-001_v1. 1. NVIDIA. Aug, p. 108 (2017)"},{"key":"35_CR32","unstructured":"Okuta, R., Unno, Y., Nishino, D., Hido, S., Loomis, C.: CuPy: a numpy-compatible library for NVIDIA GPU calculations. In: Proceedings of Workshop on Machine Learning Systems (LearningSys) in The Thirty-first Annual Conference on Neural Information Processing Systems (NIPS) (2017)"},{"key":"35_CR33","doi-asserted-by":"crossref","unstructured":"Wang, Y., Yang, C., Farrel, S., Zhang, Kurth, Y.T., Williams, S.: Time-based roofline for deep learning performance analysis. In: 2020 IEEE\/ACM Deep Learning on Supercomputers Workshop (2020, Submitted)","DOI":"10.1109\/DLS51937.2020.00007"},{"key":"35_CR34","doi-asserted-by":"crossref","unstructured":"Williams, S., Waterman, A., Patterson, D.: Roofline: an insightful visual performance model for floating-point programs and multicore architectures. Technical report, Lawrence Berkeley National Lab. (LBNL), Berkeley, CA, USA (2009)","DOI":"10.2172\/1407078"},{"key":"35_CR35","unstructured":"Yang, C.: 8 Steps to 3.7 TFLOP\/s on NVIDIA V100 GPU: Roofline analysis and other tricks"},{"key":"35_CR36","unstructured":"Yang, C.: Hierarchical roofline analysis: how to collect data using performance tools on Intel CPUs and NVIDIA GPUs"},{"key":"35_CR37","unstructured":"Yang, C., Friesen, B., Kurth, T., Cook, B., Williams, S.: Toward automated application profiling on cray systems. In: Cray User Group Conference (CUG) (2018)"},{"key":"35_CR38","doi-asserted-by":"crossref","unstructured":"Yang, C., et al.: An empirical roofline methodology for quantitatively assessing performance portability. In: 2018 IEEE\/ACM International Workshop on Performance, Portability and Productivity in HPC (P3HPC), pp. 14\u201323. IEEE (2018)","DOI":"10.1109\/P3HPC.2018.00005"},{"key":"35_CR39","doi-asserted-by":"crossref","first-page":"e5547","DOI":"10.1002\/cpe.5547","volume":"32","author":"C Yang","year":"2019","unstructured":"Yang, C., Kurth, T., Williams, S.: Hierarchical roofline analysis for GPUs: accelerating performance optimization for the NERSC-9 perlmutter system. Concurr. Comput. Pract. Exp. 32, e5547 (2019)","journal-title":"Concurr. Comput. Pract. Exp."}],"container-title":["Lecture Notes in Networks and Systems","Intelligent Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-80126-7_35","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,11,5]],"date-time":"2023-11-05T18:11:11Z","timestamp":1699207871000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-80126-7_35"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030801250","9783030801267"],"references-count":39,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-80126-7_35","relation":{},"ISSN":["2367-3370","2367-3389"],"issn-type":[{"type":"print","value":"2367-3370"},{"type":"electronic","value":"2367-3389"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"7 July 2021","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}