{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T18:11:09Z","timestamp":1771956669835,"version":"3.50.1"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031396977","type":"print"},{"value":"9783031396984","type":"electronic"}],"license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2023]]},"DOI":"10.1007\/978-3-031-39698-4_31","type":"book-chapter","created":{"date-parts":[[2023,8,23]],"date-time":"2023-08-23T06:02:40Z","timestamp":1692770560000},"page":"458-473","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["TrainBF: High-Performance DNN Training Engine Using BFloat16 on\u00a0AI Accelerators"],"prefix":"10.1007","author":[{"given":"Zhen","family":"Xie","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Siddhisanket","family":"Raskar","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Murali","family":"Emani","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Venkatram","family":"Vishwanath","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,8,24]]},"reference":[{"issue":"4","key":"31_CR1","doi-asserted-by":"publisher","first-page":"80","DOI":"10.1109\/38.595279","volume":"17","author":"JF Blinn","year":"1997","unstructured":"Blinn, J.F.: Floating-point tricks. IEEE Comput. Graphics Appl. 17(4), 80\u201384 (1997)","journal-title":"IEEE Comput. Graphics Appl."},{"key":"31_CR2","doi-asserted-by":"crossref","unstructured":"Burgess, N., Milanovic, J., Stephens, N., Monachopoulos, K., Mansell, D.: BFloat16 processing for neural networks. In: 2019 IEEE 26th Symposium on Computer Arithmetic (ARITH), pp. 88\u201391. IEEE (2019)","DOI":"10.1109\/ARITH.2019.00022"},{"issue":"2","key":"31_CR3","doi-asserted-by":"publisher","first-page":"29","DOI":"10.1109\/MM.2021.3061394","volume":"41","author":"J Choquette","year":"2021","unstructured":"Choquette, J., Gandhi, W., Giroux, O., Stam, N., Krashinsky, R.: NVIDIA A100 tensor core GPU: performance and innovation. IEEE Micro 41(2), 29\u201335 (2021)","journal-title":"IEEE Micro"},{"key":"31_CR4","unstructured":"contributors, W.: BFloat16 floating-point format (2021). https:\/\/en.wikipedia.org\/wiki\/Bfloat16_floating-point_format"},{"key":"31_CR5","unstructured":"Das, D., et al.: Mixed precision training of convolutional neural networks using integer operations. arXiv preprint arXiv:1802.00930 (2018)"},{"key":"31_CR6","unstructured":"Emani, M., et al.: A comprehensive evaluation of novel AI accelerators for deep learning workloads. In: 2022 IEEE\/ACM International Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS), pp. 13\u201325. IEEE (2022)"},{"key":"31_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"105","DOI":"10.1007\/978-3-030-58520-4_7","volume-title":"Computer Vision \u2013 ECCV 2020","author":"G Franchi","year":"2020","unstructured":"Franchi, G., Bursuc, A., Aldea, E., Dubuisson, S., Bloch, I.: TRADI: tracking deep neural network weight distributions. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.-M. (eds.) ECCV 2020. LNCS, vol. 12362, pp. 105\u2013121. Springer, Cham (2020). https:\/\/doi.org\/10.1007\/978-3-030-58520-4_7"},{"key":"31_CR8","unstructured":"Gupta, S., Agrawal, A., Gopalakrishnan, K., Narayanan, P.: Deep learning with limited numerical precision. In: International Conference on Machine Learning, pp. 1737\u20131746. PMLR (2015)"},{"key":"31_CR9","doi-asserted-by":"crossref","unstructured":"He, X., Chen, Z., Sun, J., Chen, H., Li, D., Quan, Z.: Exploring synchronization in cache coherent manycore systems: a case study with xeon phi. In: 2017 IEEE 23rd International Conference on Parallel and Distributed Systems (ICPADS), pp. 232\u2013239. IEEE (2017)","DOI":"10.1109\/ICPADS.2017.00040"},{"key":"31_CR10","doi-asserted-by":"crossref","unstructured":"He, X., et al.: Enabling energy-efficient DNN training on hybrid GPU-FPGA accelerators. In: Proceedings of the ACM International Conference on Supercomputing, pp. 227\u2013241 (2021)","DOI":"10.1145\/3447818.3460371"},{"key":"31_CR11","unstructured":"He, X., Sun, J., Chen, H., Li, D.: Campo: $$\\{$$Cost-Aware$$\\}$$ performance optimization for $$\\{$$Mixed-Precision$$\\}$$ neural network training. In: 2022 USENIX Annual Technical Conference (USENIX ATC 22), pp. 505\u2013518 (2022)"},{"key":"31_CR12","doi-asserted-by":"publisher","first-page":"35","DOI":"10.1016\/j.future.2021.04.011","volume":"123","author":"X He","year":"2021","unstructured":"He, X., Yao, Y., Chen, Z., Sun, J., Chen, H.: Efficient parallel A* search on multi-GPU system. Futur. Gener. Comput. Syst. 123, 35\u201347 (2021)","journal-title":"Futur. Gener. Comput. Syst."},{"key":"31_CR13","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. In: International Conference on Machine Learning, pp. 448\u2013456. PMLR (2015)"},{"key":"31_CR14","unstructured":"Jia, X., et al.: Highly scalable deep learning training system with mixed-precision: training ImageNet in four minutes. arXiv preprint arXiv:1807.11205 (2018)"},{"key":"31_CR15","unstructured":"Johnson, J.: Rethinking floating point for deep learning. arXiv preprint arXiv:1811.01721 (2018)"},{"key":"31_CR16","doi-asserted-by":"crossref","unstructured":"Johnston, J.T., et al.: Fine-grained exploitation of mixed precision for faster CNN training. In: 2019 IEEE\/ACM Workshop on Machine Learning in High Performance Computing Environments (MLHPC), pp. 9\u201318. IEEE (2019)","DOI":"10.1109\/MLHPC49564.2019.00007"},{"key":"31_CR17","doi-asserted-by":"crossref","unstructured":"Kuchaiev, O., Ginsburg, B., Gitman, I., Lavrukhin, V., Case, C., Micikevicius, P.: OpenSeq2Seq: extensible toolkit for distributed and mixed precision training of sequence-to-sequence models. In: Proceedings of Workshop for NLP Open Source Software (NLP-OSS), pp. 41\u201346 (2018)","DOI":"10.18653\/v1\/W18-2507"},{"key":"31_CR18","unstructured":"Kuchaiev, O., et al.: Mixed-precision training for NLP and speech recognition with openseq2seq. arXiv preprint arXiv:1805.10387 (2018)"},{"key":"31_CR19","first-page":"336","volume":"2","author":"P Mattson","year":"2020","unstructured":"Mattson, P., et al.: MLPerf training benchmark. Proc. Mach. Learn. Syst. 2, 336\u2013349 (2020)","journal-title":"Proc. Mach. Learn. Syst."},{"key":"31_CR20","unstructured":"Mellempudi, N., Srinivasan, S., Das, D., Kaul, B.: Mixed precision training with 8-bit floating point. arXiv preprint arXiv:1905.12334 (2019)"},{"key":"31_CR21","unstructured":"Micikevicius, P., et al.: Mixed precision training. arXiv preprint arXiv:1710.03740 (2017)"},{"key":"31_CR22","unstructured":"Mishra, A., Nurvitadhi, E., Cook, J.J., Marr, D.: WRPN: wide reduced-precision networks. arXiv preprint arXiv:1709.01134 (2017)"},{"key":"31_CR23","unstructured":"PyTorch: Automatic Mixed Precision package (2022). https:\/\/pytorch.org\/docs\/stable\/amp.html. Accessed 1 Aug 2022"},{"key":"31_CR24","doi-asserted-by":"crossref","unstructured":"Seide, F., Fu, H., Droppo, J., Li, G., Yu, D.: 1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs. In: Fifteenth Annual Conference of the International Speech Communication Association. Citeseer (2014)","DOI":"10.21437\/Interspeech.2014-274"},{"issue":"12","key":"31_CR25","doi-asserted-by":"publisher","first-page":"2295","DOI":"10.1109\/JPROC.2017.2761740","volume":"105","author":"V Sze","year":"2017","unstructured":"Sze, V., Chen, Y.H., Yang, T.J., Emer, J.S.: Efficient processing of deep neural networks: a tutorial and survey. Proc. IEEE 105(12), 2295\u20132329 (2017)","journal-title":"Proc. IEEE"},{"key":"31_CR26","unstructured":"Ulyanov, D., Vedaldi, A., Lempitsky, V.: Instance normalization: the missing ingredient for fast stylization. arXiv preprint arXiv:1607.08022 (2016)"},{"key":"31_CR27","doi-asserted-by":"crossref","unstructured":"Wu, Y., He, K.: Group normalization. In: Proceedings of the European conference on computer vision (ECCV), pp. 3\u201319 (2018)","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"31_CR28","doi-asserted-by":"crossref","unstructured":"Xie, Z., Dong, W., Liu, J., Liu, H., Li, D.: Tahoe: tree structure-aware high performance inference engine for decision tree ensemble on GPU. In: Proceedings of the Sixteenth European Conference on Computer Systems, pp. 426\u2013440 (2021)","DOI":"10.1145\/3447786.3456251"},{"key":"31_CR29","doi-asserted-by":"crossref","unstructured":"Xie, Z., Dong, W., Liu, J., Peng, I., Ma, Y., Li, D.: MD-HM: memoization-based molecular dynamics simulations on big memory system. In: Proceedings of the ACM International Conference on Supercomputing, pp. 215\u2013226 (2021)","DOI":"10.1145\/3447818.3460365"},{"key":"31_CR30","doi-asserted-by":"crossref","unstructured":"Xie, Z., Liu, J., Li, J., Li, D.: Merchandiser: data placement on heterogeneous memory for task-parallel HPC applications with load-balance awareness (2023)","DOI":"10.1145\/3572848.3577497"},{"key":"31_CR31","doi-asserted-by":"crossref","unstructured":"Xie, Z., Tan, G., Liu, W., Sun, N.: IA-SpGEMM: an input-aware auto-tuning framework for parallel sparse matrix-matrix multiplication. In: Proceedings of the ACM International Conference on Supercomputing, pp. 94\u2013105 (2019)","DOI":"10.1145\/3330345.3330354"},{"key":"31_CR32","unstructured":"Zamirai, P., Zhang, J., Aberger, C.R., De Sa, C.: Revisiting BFloat16 training. arXiv preprint arXiv:2010.06192 (2020)"},{"issue":"3","key":"31_CR33","doi-asserted-by":"publisher","first-page":"739","DOI":"10.1109\/TSMCA.2011.2170414","volume":"42","author":"H Zhu","year":"2011","unstructured":"Zhu, H., Zhou, M., Alkins, R.: Group role assignment via a Kuhn-Munkres algorithm-based solution. IEEE Trans. Syst. Man Cybern.-Part A: Syst. Hum. 42(3), 739\u2013750 (2011)","journal-title":"IEEE Trans. Syst. Man Cybern.-Part A: Syst. Hum."},{"key":"31_CR34","doi-asserted-by":"crossref","unstructured":"Zvyagin, M., et al.: GenSLMs: genome-scale language models reveal SARS-CoV-2 evolutionary dynamics. bioRxiv, p. 2022\u201310 (2022)","DOI":"10.1101\/2022.10.10.511571"}],"container-title":["Lecture Notes in Computer Science","Euro-Par 2023: Parallel Processing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-39698-4_31","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,8,23]],"date-time":"2023-08-23T06:06:51Z","timestamp":1692770811000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-39698-4_31"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023]]},"ISBN":["9783031396977","9783031396984"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-39698-4_31","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023]]},"assertion":[{"value":"24 August 2023","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"Euro-Par","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Parallel Processing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Limassol","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Cyprus","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2023","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2023","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 September 2023","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"europar2023","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/2023.euro-par.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Single-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"easychair","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"164","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"49","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"30% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.98","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}