{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T02:28:36Z","timestamp":1750127316140,"version":"3.40.3"},"publisher-location":"Cham","reference-count":19,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030343552"},{"type":"electronic","value":"9783030343569"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-34356-9_37","type":"book-chapter","created":{"date-parts":[[2019,12,2]],"date-time":"2019-12-02T18:37:03Z","timestamp":1575311823000},"page":"490-503","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["MagmaDNN: Towards High-Performance Data Analytics and Machine Learning for Data-Driven Scientific Computing"],"prefix":"10.1007","author":[{"given":"Daniel","family":"Nichols","sequence":"first","affiliation":[]},{"given":"Nathalie-Sofia","family":"Tomov","sequence":"additional","affiliation":[]},{"given":"Frank","family":"Betancourt","sequence":"additional","affiliation":[]},{"given":"Stanimire","family":"Tomov","sequence":"additional","affiliation":[]},{"given":"Kwai","family":"Wong","sequence":"additional","affiliation":[]},{"given":"Jack","family":"Dongarra","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,12,3]]},"reference":[{"key":"37_CR1","unstructured":"Abadi, M., et al.: TensorFlow: large-scale machine learning on heterogeneous distributed systems. CoRR abs\/1603.04467 (2016). \nhttp:\/\/arxiv.org\/abs\/1603.04467"},{"key":"37_CR2","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1007\/978-3-319-41321-1_2","volume-title":"High Performance Computing","author":"A Abdelfattah","year":"2016","unstructured":"Abdelfattah, A., Haidar, A., Tomov, S., Dongarra, J.: Performance, design, and autotuning of batched GEMM for GPUs. In: Kunkel, J.M., Balaji, P., Dongarra, J. (eds.) ISC High Performance 2016. LNCS, vol. 9697, pp. 21\u201338. Springer, Cham (2016). \nhttps:\/\/doi.org\/10.1007\/978-3-319-41321-1_2"},{"key":"37_CR3","unstructured":"Ben-Nun, T., Hoefler, T.: Demystifying parallel and distributed deep learning: an in-depth concurrency analysis. CoRR abs\/1802.09941 (2018). \nhttp:\/\/arxiv.org\/abs\/1802.09941"},{"key":"37_CR4","unstructured":"Chen, J., Monga, R., Bengio, S., J\u00f3zefowicz, R.: Revisiting distributed synchronous SGD. CoRR abs\/1604.00981 (2016). \nhttp:\/\/arxiv.org\/abs\/1604.00981"},{"key":"37_CR5","unstructured":"Chen, S., Gessinger, A., Tomov, S.: Design and acceleration of convolutional neural networks on modern architectures. Technical report, Joint Institute for Computational Sciences (JICS), UTK (2018). 2018 Summer Research Experiences for Undergraduate (REU), Knoxville, TN 2018"},{"key":"37_CR6","unstructured":"Chetlur, S., et al.: cuDNN: efficient primitives for deep learning. CoRR abs\/1410.0759 (2014). \nhttp:\/\/arxiv.org\/abs\/1410.0759"},{"key":"37_CR7","doi-asserted-by":"publisher","first-page":"3","DOI":"10.1016\/j.parco.2017.10.004","volume":"74","author":"M Gates","year":"2018","unstructured":"Gates, M., Tomov, S., Dongarra, J.: Accelerating the SVD two stage bidiagonal reduction and divide and conquer using GPUs. Parallel Comput. 74, 3\u201318 (2018). \nhttps:\/\/doi.org\/10.1016\/j.parco.2017.10.004\n\n. \nhttp:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167819117301758\n\n. Parallel Matrix Algorithms and Applications (PMAA\u201916)","journal-title":"Parallel Comput."},{"key":"37_CR8","unstructured":"Goyal, P., et al.: Accurate, large minibatch SGD: training imagenet in 1 hour. CoRR abs\/1706.02677 (2017). \nhttp:\/\/arxiv.org\/abs\/1706.02677"},{"key":"37_CR9","unstructured":"Iandola, F.N., Ashraf, K., Moskewicz, M.W., Keutzer, K.: FireCaffe: near-linear acceleration of deep neural network training on compute clusters. CoRR abs\/1511.00175 (2015). \nhttp:\/\/arxiv.org\/abs\/1511.00175"},{"key":"37_CR10","unstructured":"Jia, Y., et al.: Caffe: convolutional architecture for fast feature embedding. CoRR abs\/1408.5093 (2014). \nhttp:\/\/arxiv.org\/abs\/1408.5093"},{"issue":"11","key":"37_CR11","doi-asserted-by":"publisher","first-page":"2278","DOI":"10.1109\/5.726791","volume":"86","author":"Y Lecun","year":"1998","unstructured":"Lecun, Y., Bottou, L., Bengio, Y., Haffner, P.: Gradient-based learning applied to document recognition. Proc. IEEE 86(11), 2278\u20132324 (1998). \nhttps:\/\/doi.org\/10.1109\/5.726791","journal-title":"Proc. IEEE"},{"key":"37_CR12","unstructured":"Smith, S.L., Kindermans, P., Le, Q.V.: Don\u2019t decay the learning rate, increase the batch size. CoRR abs\/1711.00489 (2017). \nhttp:\/\/arxiv.org\/abs\/1711.00489"},{"key":"37_CR13","doi-asserted-by":"publisher","unstructured":"Sorna, A., Cheng, X., D\u2019Azevedo, E., Wong, K., Tomov, S.: Optimizing the fast fourier transform using mixed precision on tensor core hardware. In: 2018 IEEE 25th International Conference on High Performance Computing Workshops (HiPCW). pp. 3\u20137, December 2018. \nhttps:\/\/doi.org\/10.1109\/HiPCW.2018.8634417","DOI":"10.1109\/HiPCW.2018.8634417"},{"key":"37_CR14","unstructured":"Tomov, N., Tomov, S.: On deep neural networks for detecting heart disease. CoRR abs\/1808.07168 (2018). \nhttp:\/\/arxiv.org\/abs\/1808.07168"},{"issue":"5","key":"37_CR15","doi-asserted-by":"publisher","first-page":"232","DOI":"10.1016\/j.parco.2009.12.005","volume":"36","author":"S Tomov","year":"2010","unstructured":"Tomov, S., Dongarra, J., Baboulin, M.: Towards dense linear algebra for hybrid GPU accelerated manycore systems. Parallel Comput. 36(5), 232\u2013240 (2010). \nhttps:\/\/doi.org\/10.1016\/j.parco.2009.12.005\n\n. \nhttp:\/\/www.sciencedirect.com\/science\/article\/pii\/S0167819109001276\n\n. Parallel Matrix Algorithms and Applications","journal-title":"Parallel Comput."},{"key":"37_CR16","unstructured":"Tomov, S., Haidar, A., Ayala, A., Schultz, D., Dongarra, J.: Design and implementation for FFT-ECP on distributed accelerated systems. ECP WBS 2.3.3.09 Milestone Report FFT-ECP ST-MS-10-1410, Innovative Computing Laboratory, University of Tennessee, April 2019. 04\u20132019 revision"},{"key":"37_CR17","doi-asserted-by":"crossref","unstructured":"Wong, K., Brown, L., Coan, J., White, D.: Distributive interoperable executive library (DIEL) for systems of multiphysics simulation. In: 2014 15th International Conference on Parallel and Distributed Computing, Applications and Technologies, pp. 49\u201355. IEEE (2014)","DOI":"10.1109\/PDCAT.2014.16"},{"key":"37_CR18","unstructured":"You, Y., Gitman, I., Ginsburg, B.: Scaling SGD batch size to 32k for imagenet training. CoRR abs\/1708.03888 (2017). \nhttp:\/\/arxiv.org\/abs\/1708.03888"},{"key":"37_CR19","unstructured":"You, Y., Zhang, Z., Hsieh, C., Demmel, J.: 100-epoch imagenet training with AlexNet in 24 minutes. CoRR abs\/1709.05011 (2017). \nhttp:\/\/arxiv.org\/abs\/1709.05011"}],"container-title":["Lecture Notes in Computer Science","High Performance Computing"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-34356-9_37","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,5,13]],"date-time":"2020-05-13T16:30:29Z","timestamp":1589387429000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-34356-9_37"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030343552","9783030343569"],"references-count":19,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-34356-9_37","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"3 December 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ISC High Performance","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on High Performance Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Frankfurt","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2019","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 June 2019","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"20 June 2019","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"34","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"supercomputing2019","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.isc-hpc.com\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Linklings","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"70","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"48","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"69% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4-5","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"n\/a","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}