{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T14:34:11Z","timestamp":1777127651741,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,6,17]],"date-time":"2023-06-17T00:00:00Z","timestamp":1686960000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,6,17]]},"DOI":"10.1145\/3579371.3589351","type":"proceedings-article","created":{"date-parts":[[2023,6,16]],"date-time":"2023-06-16T20:25:28Z","timestamp":1686947128000},"page":"1-13","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":40,"title":["With Shared Microexponents, A Little Shifting Goes a Long Way"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8412-4320","authenticated-orcid":false,"given":"Bita","family":"Darvish Rouhani","sequence":"first","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1656-9165","authenticated-orcid":false,"given":"Ritchie","family":"Zhao","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7031-9020","authenticated-orcid":false,"given":"Venmugil","family":"Elango","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2996-2671","authenticated-orcid":false,"given":"Rasoul","family":"Shafipour","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2134-8247","authenticated-orcid":false,"given":"Mathew","family":"Hall","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1262-5836","authenticated-orcid":false,"given":"Maral","family":"Mesmakhosroshahi","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-2813-3988","authenticated-orcid":false,"given":"Ankit","family":"More","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9007-227X","authenticated-orcid":false,"given":"Levi","family":"Melnick","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-2893-5066","authenticated-orcid":false,"given":"Maximilian","family":"Golub","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-4504-0717","authenticated-orcid":false,"given":"Girish","family":"Varatkar","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6706-7076","authenticated-orcid":false,"given":"Lai","family":"Shao","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7807-6721","authenticated-orcid":false,"given":"Gaurav","family":"Kolhe","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3452-5406","authenticated-orcid":false,"given":"Dimitry","family":"Melts","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-7271-9238","authenticated-orcid":false,"given":"Jasmine","family":"Klar","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-4675-0755","authenticated-orcid":false,"given":"Renee","family":"L'Heureux","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4942-7570","authenticated-orcid":false,"given":"Matt","family":"Perry","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6588-6596","authenticated-orcid":false,"given":"Doug","family":"Burger","sequence":"additional","affiliation":[{"name":"Microsoft, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5871-4346","authenticated-orcid":false,"given":"Eric","family":"Chung","sequence":"additional","affiliation":[{"name":"Microsoft Azure, Redmond, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6946-5357","authenticated-orcid":false,"given":"Zhaoxia (Summer)","family":"Deng","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6511-1866","authenticated-orcid":false,"given":"Sam","family":"Naghshineh","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4750-9440","authenticated-orcid":false,"given":"Jongsoo","family":"Park","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6102-2903","authenticated-orcid":false,"given":"Maxim","family":"Naumov","sequence":"additional","affiliation":[{"name":"Meta, Menlo Park, USA"}]}],"member":"320","published-online":{"date-parts":[[2023,6,17]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"12449","article-title":"wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations","volume":"33","author":"Baevski Alexei","year":"2020","unstructured":"Alexei Baevski , Yuhao Zhou , Abdelrahman Mohamed , and Michael Auli . 2020 . wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations . Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), 12449 -- 12460 . Alexei Baevski, Yuhao Zhou, Abdelrahman Mohamed, and Michael Auli. 2020. wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations. Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), 12449--12460.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_2_1","volume-title":"Findings of the 2014 Workshop on Statistical Machine Translation. Conference on Machine Translation (WMT","author":"Bojar Ond\u0159ej","year":"2014","unstructured":"Ond\u0159ej Bojar , Christian Buck , Christian Federmann , Barry Haddow , Philipp Koehn , Johannes Leveling , Christof Monz , Pavel Pecina , Matt Post , Herve Saint-Amand , Radu Soricut , Lucia Specia , and Ale\u0161 Tamchyna . 2014 . Findings of the 2014 Workshop on Statistical Machine Translation. Conference on Machine Translation (WMT (2014), 12--58. Ond\u0159ej Bojar, Christian Buck, Christian Federmann, Barry Haddow, Philipp Koehn, Johannes Leveling, Christof Monz, Pavel Pecina, Matt Post, Herve Saint-Amand, Radu Soricut, Lucia Specia, and Ale\u0161 Tamchyna. 2014. Findings of the 2014 Workshop on Statistical Machine Translation. 
Conference on Machine Translation (WMT (2014), 12--58."},{"key":"e_1_3_2_1_3_1","first-page":"17","volume-title":"Findings of the 2017 Conference on Machine Translation (WMT17). Conference on Machine Translation (WMT (September","author":"Bojar Ond","year":"2017","unstructured":"Ond rej Bojar , Rajen Chatterjee , Christian Federmann , Yvette Graham , Barry Haddow , Shujian Huang , Matthias Huck , Philipp Koehn , Qun Liu , Varvara Logacheva , Christof Monz , Matteo Negri , Matt Post , Raphael Rubino , Lucia Specia , and Marco Turchi . 2017 . Findings of the 2017 Conference on Machine Translation (WMT17). Conference on Machine Translation (WMT (September 2017), 169--214. http:\/\/www.aclweb.org\/anthology\/W 17 - 4717 Ond rej Bojar, Rajen Chatterjee, Christian Federmann, Yvette Graham, Barry Haddow, Shujian Huang, Matthias Huck, Philipp Koehn, Qun Liu, Varvara Logacheva, Christof Monz, Matteo Negri, Matt Post, Raphael Rubino, Lucia Specia, and Marco Turchi. 2017. Findings of the 2017 Conference on Machine Translation (WMT17). Conference on Machine Translation (WMT (September 2017), 169--214. http:\/\/www.aclweb.org\/anthology\/W17-4717"},{"key":"e_1_3_2_1_4_1","first-page":"1877","article-title":"Language Models are Few-Shot Learners","volume":"33","author":"Brown Tom","year":"2020","unstructured":"Tom Brown , Benjamin Mann , Nick Ryder , Melanie Subbiah , Jared D Kaplan , Prafulla Dhariwal , Arvind Neelakantan , Pranav Shyam , Girish Sastry , Amanda Askell , Sandhini Agarwal , Ariel Herbert-Voss , Gretchen Krueger , Tom Henighan , Rewon Child , Aditya Ramesh , Daniel Ziegler , Jeffrey Wu , Clemens Winter , Chris Hesse , Mark Chen , Eric Sigler , Mateusz Litwin , Scott Gray , Benjamin Chess , Jack Clark , Christopher Berner , Sam McCandlish , Alec Radford , Ilya Sutskever , and Dario Amodei . 2020 . Language Models are Few-Shot Learners . Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), 1877 -- 1901 . Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), 1877--1901.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_5_1","volume-title":"Accuracy Boosters: Epoch-Driven Mixed-Mantissa Block Floating-Point for DNN Training. arXiv preprint arXiv:2211.10737","author":"Harma Simla Burcu","year":"2022","unstructured":"Simla Burcu Harma , Canberk S\u00f6nmez , Babak Falsafi , Martin Jaggi , and Yunho Oh . 2022 . Accuracy Boosters: Epoch-Driven Mixed-Mantissa Block Floating-Point for DNN Training. arXiv preprint arXiv:2211.10737 (2022). Simla Burcu Harma, Canberk S\u00f6nmez, Babak Falsafi, Martin Jaggi, and Yunho Oh. 2022. Accuracy Boosters: Epoch-Driven Mixed-Mantissa Block Floating-Point for DNN Training. arXiv preprint arXiv:2211.10737 (2022)."},{"key":"e_1_3_2_1_6_1","volume-title":"last accessed","author":"Image Convolutional Network","year":"2022","unstructured":"Convolutional Network for Image Classification in PyTorch . last accessed 2022 . 
,{"key":"e_1_3_2_1_6_1","volume-title":"last accessed","author":"Image Convolutional Network","year":"2022","unstructured":"Convolutional Network for Image Classification in PyTorch. last accessed 2022. https:\/\/github.com\/NVIDIA\/DeepLearningExamples\/tree\/master\/PyTorch\/Classification\/ConvNets"},{"key":"e_1_3_2_1_7_1","volume-title":"VS-Quant: Per-vector Scaled Quantization for Accurate Low-Precision Neural Network Inference. Machine Learning and Systems (MLSys) 3","author":"Dai Steve","year":"2021","unstructured":"Steve Dai, Rangha Venkatesan, Mark Ren, Brian Zimmer, William Dally, and Brucek Khailany. 2021. VS-Quant: Per-vector Scaled Quantization for Accurate Low-Precision Neural Network Inference. Machine Learning and Systems (MLSys) 3 (2021), 873--884."},{"key":"e_1_3_2_1_8_1","first-page":"10271","article-title":"Pushing the Limits of Narrow Precision Inferencing at Cloud Scale with Microsoft Floating Point","volume":"33","author":"Rouhani Bita Darvish","year":"2020","unstructured":"Bita Darvish Rouhani, Daniel Lo, Ritchie Zhao, Ming Liu, Jeremy Fowers, Kalin Ovtcharov, Anna Vinogradsky, Sarah Massengill, Lita Yang, Ray Bittner, Alessandro Forin, Haishan Zhu, Taesik Na, Prerak Patel, Shuai Che, Lok Chand Koppaka, Xia Song, Subhojit Som, Kaustav Das, Saurabh T, Steve Reinhardt, Sitaram Lanka, Eric Chung, and Doug Burger. 2020. Pushing the Limits of Narrow Precision Inferencing at Cloud Scale with Microsoft Floating Point. Advances in Neural Information Processing Systems (NeurIPS) 33 (2020), 10271--10281.","journal-title":"Advances in Neural Information Processing Systems (NeurIPS)"},{"key":"e_1_3_2_1_9_1","volume-title":"last accessed","author":"Data-Efficient","year":"2022","unstructured":"Data-Efficient architectures and training for Image classification. last accessed 2022. https:\/\/github.com\/facebookresearch\/deit"},{"key":"e_1_3_2_1_10_1","volume-title":"last accessed","author":"Recommendation Deep Learning","year":"2022","unstructured":"Deep Learning Recommendation Model for Personalization and Recommendation Systems. last accessed 2022. https:\/\/github.com\/facebookresearch\/dlrm"},{"key":"e_1_3_2_1_11_1","volume-title":"last accessed","author":"Definition Hardware Numerics","year":"2022","unstructured":"BFLOAT16 Hardware Numerics Definition. last accessed 2022. https:\/\/www.intel.com\/content\/dam\/develop\/external\/us\/en\/documents\/bf16-hardware-numerics-definition-white-paper.pdf"},{"key":"e_1_3_2_1_12_1","volume-title":"ImageNet: A Large-Scale Hierarchical Image Database. Conf. on Computer Vision and Pattern Recognition (CVPR)","author":"Deng Jia","year":"2009","unstructured":"Jia Deng, Wei Dong, Richard Socher, Li-Jia Li, Kai Li, and Li Fei-Fei. 2009. ImageNet: A Large-Scale Hierarchical Image Database. Conf. on Computer Vision and Pattern Recognition (CVPR) (2009), 248--255."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3081981"},{"key":"e_1_3_2_1_14_1","volume-title":"The case for 4-bit precision: k-bit Inference Scaling Laws. arXiv preprint arXiv:2212.09720","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers and Luke Zettlemoyer. 2022. The case for 4-bit precision: k-bit Inference Scaling Laws. arXiv preprint arXiv:2212.09720 (2022)."},{"key":"e_1_3_2_1_15_1","volume-title":"BERT: Pre-Training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805 (Oct.","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-Training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805 (Oct. 2018)."},{"key":"e_1_3_2_1_16_1","volume-title":"Training DNNs with Hybrid Block Floating Point. Advances in Neural Information Processing Systems (NeurIPS) 31","author":"Drumond Mario","year":"2018","unstructured":"Mario Drumond, Tao Lin, Martin Jaggi, and Babak Falsafi. 2018. Training DNNs with Hybrid Block Floating Point. Advances in Neural Information Processing Systems (NeurIPS) 31 (2018)."},{"key":"e_1_3_2_1_17_1","unstructured":"William Fedus, Barret Zoph, and Noam Shazeer. 2021. Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity. arXiv preprint arXiv:2101.03961 (2021)."}
,{"key":"e_1_3_2_1_18_1","volume-title":"A survey of quantization methods for efficient neural network inference. arXiv preprint arXiv:2103.13630","author":"Gholami Amir","year":"2021","unstructured":"Amir Gholami, Sehoon Kim, Zhen Dong, Zhewei Yao, Michael W Mahoney, and Kurt Keutzer. 2021. A survey of quantization methods for efficient neural network inference. arXiv preprint arXiv:2103.13630 (2021)."},{"key":"e_1_3_2_1_19_1","volume-title":"last accessed","author":"For PyTorch GNMT","year":"2022","unstructured":"GNMT v2 For PyTorch. last accessed 2022. https:\/\/github.com\/NVIDIA\/DeepLearningExamples\/tree\/master\/PyTorch\/Translation\/GNMT"},{"key":"e_1_3_2_1_20_1","volume-title":"Deep Residual Learning for Image Recognition. arXiv preprint arXiv:1512.03385 (Dec.","author":"He Kaiming","year":"2015","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv preprint arXiv:1512.03385 (Dec. 2015)."},{"key":"e_1_3_2_1_21_1","volume-title":"last accessed","year":"2022","unstructured":"Improved-diffusion. last accessed 2022. https:\/\/github.com\/openai\/improved-diffusion"},{"key":"e_1_3_2_1_22_1","volume-title":"Nataraj Jammalamadaka, Jianyu Huang, Hector Yuen, et al.","author":"Kalamkar Dhiraj","year":"2019","unstructured":"Dhiraj Kalamkar, Dheevatsa Mudigere, Naveen Mellempudi, Dipankar Das, Kunal Banerjee, Sasikanth Avancha, Dharma Teja Vooturi, Nataraj Jammalamadaka, Jianyu Huang, Hector Yuen, et al. 2019. A study of BFLOAT16 for deep learning training. arXiv preprint arXiv:1905.12322 (2019)."},{"key":"e_1_3_2_1_23_1","unstructured":"Dhiraj Kalamkar, Dheevatsa Mudigere, Naveen Mellempudi, Dipankar Das, Kunal Banerjee, Sasikanth Avancha, Dharma Teja Vooturi, Nataraj Jammalamadaka, Jianyu Huang, Hector Yuen, Jiyan Yang, Jongsoo Park, Alexander Heinecke, Evangelos Georganas, Sudarshan Srinivasan, Abhisek Kundu, Misha Smelyanskiy, Bharat Kaul, and Pradeep Dubey. 2019. A Study of BFLOAT16 for Deep Learning Training. arXiv preprint arXiv:1905.12322 (2019)."},{"key":"e_1_3_2_1_24_1","volume-title":"Scaling Laws for Neural Language Models. arXiv preprint arXiv:2001.08361","author":"Kaplan Jared","year":"2020","unstructured":"Jared Kaplan, Sam McCandlish, Tom Henighan, Tom B Brown, Benjamin Chess, Rewon Child, Scott Gray, Alec Radford, Jeffrey Wu, and Dario Amodei. 2020. Scaling Laws for Neural Language Models. arXiv preprint arXiv:2001.08361 (2020)."},{"key":"e_1_3_2_1_25_1","volume-title":"Mixed Precision Training. Int'l Conf. on Learning Representations (ICLR)","author":"Micikevicius Paulius","year":"2018","unstructured":"Paulius Micikevicius, Sharan Narang, Jonah Alben, Gregory Diamos, Erich Elsen, David Garcia, Boris Ginsburg, Michael Houston, Oleksii Kuchaiev, Ganesh Venkatesh, and Hao Wu. 2018. Mixed Precision Training. Int'l Conf. on Learning Representations (ICLR) (2018)."},{"key":"e_1_3_2_1_26_1","volume-title":"FP8 Formats for Deep Learning. arXiv preprint arXiv:2209.05433","author":"Micikevicius Paulius","year":"2022","unstructured":"Paulius Micikevicius, Dusan Stosic, Neil Burgess, Marius Cornea, Pradeep Dubey, Richard Grisenthwaite, Sangwon Ha, Alexander Heinecke, Patrick Judd, John Kamalu, Naveen Mellempudi, Stuart Oberman, Mohammad Shoeybi, Michael Siu, and Hao Wu. 2022. FP8 Formats for Deep Learning. arXiv preprint arXiv:2209.05433 (2022)."},{"key":"e_1_3_2_1_27_1","unstructured":"Maxim Naumov, Dheevatsa Mudigere, Hao-Jun Michael Shi, Jianyu Huang, Narayanan Sundaraman, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, Alisson G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. arXiv preprint arXiv:1906.00091 (2019)."},{"key":"e_1_3_2_1_28_1","volume-title":"Improved Denoising Diffusion Probabilistic Models. arXiv preprint arXiv:2102.09672","author":"Nichol Alex","year":"2021","unstructured":"Alex Nichol and Prafulla Dhariwal. 2021. Improved Denoising Diffusion Probabilistic Models. arXiv preprint arXiv:2102.09672 (2021)."},{"key":"e_1_3_2_1_29_1","unstructured":"NVIDIA. 2022. Using FP8 with Transformer Engine. https:\/\/docs.nvidia.com\/deeplearning\/transformer-engine\/user-guide\/examples\/fp8_primer.html#Introduction-to-FP8"}
,{"key":"e_1_3_2_1_30_1","volume-title":"Ongoing research training transformer. last accessed","year":"2022","unstructured":"NVIDIA\/Megatron-LM: Ongoing research training transformer. last accessed 2022. https:\/\/github.com\/NVIDIA\/Megatron-LM"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_32_1","volume-title":"Efficiently Scaling Transformer Inference. arXiv preprint arXiv:2211.05102","author":"Pope Reiner","year":"2022","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Anselm Levskaya, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2022. Efficiently Scaling Transformer Inference. arXiv preprint arXiv:2211.05102 (2022)."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00067"},{"key":"e_1_3_2_1_34_1","volume-title":"Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He.","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza Yazdani Aminabadi, Ammar Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing mixture-of-experts inference and training to power next-generation AI scale. arXiv preprint arXiv:2201.05596 (2022)."},{"key":"e_1_3_2_1_35_1","volume-title":"100,000+ Questions for Machine Comprehension of Text. arXiv preprint arXiv:1606.05250","author":"Rajpurkar Pranav","year":"2016","unstructured":"Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. SQuAD: 100,000+ Questions for Machine Comprehension of Text. arXiv preprint arXiv:1606.05250 (2016)."},{"key":"e_1_3_2_1_36_1","volume-title":"Conf. on Computer Vision and Pattern Recognition (CVPR)","author":"Sandler Mark","year":"2018","unstructured":"Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, and Liang-Chieh Chen. 2018. MobileNetV2: Inverted Residuals and Linear Bottlenecks. Conf. on Computer Vision and Pattern Recognition (CVPR) (2018), 4510--4520."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6409"},{"key":"e_1_3_2_1_38_1","volume-title":"accessed","author":"Stosic Dusan","year":"2022","unstructured":"Dusan Stosic and Paulius Micikevicius. 2021, accessed Dec 2022. Accelerating AI Training with NVIDIA TF32 Tensor Cores. NVIDIA blog. https:\/\/developer.nvidia.com\/blog\/accelerating-ai-training-with-tf32-tensor-cores\/"},{"key":"e_1_3_2_1_39_1","volume-title":"Xiaodong Cui, Wei Zhang, and Kailash Gopalakrishnan.","author":"Sun Xiao","year":"2019","unstructured":"Xiao Sun, Jungwook Choi, Chia-Yu Chen, Naigang Wang, Swagath Venkataramani, Vijayalakshmi Viji Srinivasan, Xiaodong Cui, Wei Zhang, and Kailash Gopalakrishnan. 2019. Hybrid 8-bit Floating Point (HFP8) Training and Inference for Deep Neural Networks. Advances in Neural Information Processing Systems (NeurIPS) 32 (2019)."},{"key":"e_1_3_2_1_40_1","volume-title":"Using multivariate statistics","author":"Tabachnick Barbara G","unstructured":"Barbara G Tabachnick, Linda S Fidell, and Jodie B Ullman. 2007. Using multivariate statistics. Vol. 5. Pearson, Boston, MA."},{"key":"e_1_3_2_1_41_1","volume-title":"last accessed","author":"Torchvision","year":"2022","unstructured":"Torchvision MobileNetV2. last accessed 2022. https:\/\/github.com\/pytorch\/vision"},{"key":"e_1_3_2_1_42_1","volume-title":"Training Data-Efficient Image Transformers & Distillation Through Attention. Int'l Conf. on Machine Learning (ICML) 139 (July","author":"Touvron Hugo","year":"2021","unstructured":"Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, and Herve Jegou. 2021. Training Data-Efficient Image Transformers & Distillation Through Attention. Int'l Conf. on Machine Learning (ICML) 139 (July 2021), 10347--10357."},{"key":"e_1_3_2_1_43_1","volume-title":"last accessed","author":"PyTorch Transformer For","year":"2022","unstructured":"Transformer For PyTorch. last accessed 2022. https:\/\/github.com\/NVIDIA\/DeepLearningExamples\/tree\/master\/PyTorch\/Translation\/Transformer"},{"key":"e_1_3_2_1_44_1","volume-title":"Attention is All You Need. Advances in Neural Information Processing Systems (NeurIPS) 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All You Need. Advances in Neural Information Processing Systems (NeurIPS) 30 (2017)."}
,{"key":"e_1_3_2_1_45_1","volume-title":"Training Deep Neural Networks with 8-bit Floating Point Numbers. Advances in Neural Information Processing Systems (NeurIPS) 31","author":"Wang Naigang","year":"2018","unstructured":"Naigang Wang, Jungwook Choi, Daniel Brand, Chia-Yu Chen, and Kailash Gopalakrishnan. 2018. Training Deep Neural Networks with 8-bit Floating Point Numbers. Advances in Neural Information Processing Systems (NeurIPS) 31 (2018)."},{"key":"e_1_3_2_1_46_1","volume-title":"last accessed","year":"2022","unstructured":"wav2vec 2.0. last accessed 2022. https:\/\/github.com\/facebookresearch\/fairseq\/tree\/main\/examples\/wav2vec"},{"key":"e_1_3_2_1_47_1","volume-title":"CMOS VLSI Design: A Circuits and Systems Perspective","author":"Weste Neil","unstructured":"Neil Weste and David Harris. 2010. CMOS VLSI Design: A Circuits and Systems Perspective (4th ed.). Addison-Wesley Publishing Company, USA, 457.","edition":"4"},{"key":"e_1_3_2_1_48_1","volume-title":"Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. arXiv preprint arXiv:1609.08144","author":"Wu Yonghui","year":"2016","unstructured":"Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi, Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey, Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, \u0141ukasz Kaiser, Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens, George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa, Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, and Jeffrey Dean. 2016. Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation. arXiv preprint arXiv:1609.08144 (2016)."},{"key":"e_1_3_2_1_49_1","volume-title":"SmoothQuant: Accurate and efficient post-training quantization for large language models. arXiv preprint arXiv:2211.10438","author":"Xiao Guangxuan","year":"2022","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Julien Demouth, and Song Han. 2022. SmoothQuant: Accurate and efficient post-training quantization for large language models. arXiv preprint arXiv:2211.10438 (2022)."},{"key":"e_1_3_2_1_50_1","volume-title":"Be Like Water: Adaptive Floating Point for Machine Learning. In International Conference on Machine Learning. PMLR, 25490--25500","author":"Yeh Thomas","year":"2022","unstructured":"Thomas Yeh, Max Sterner, Zerlina Lai, Brandon Chuang, and Alexander Ihler. 2022. Be Like Water: Adaptive Floating Point for Machine Learning. In International Conference on Machine Learning. PMLR, 25490--25500."},{"key":"e_1_3_2_1_51_1","volume-title":"DHEN: A Deep and Hierarchical Ensemble Network for Large-Scale Click-Through Rate Prediction. arXiv preprint arXiv:2203.11014","author":"Zhang Buyun","year":"2022","unstructured":"Buyun Zhang, Liang Luo, Xi Liu, Jay Li, Zeliang Chen, Weilin Zhang, Xiaohan Wei, Yuchen Hao, Michael Tsang, Wenjun Wang, Yang Liu, Huayu Li, Yasmine Badr, Jongsoo Park, Jiyan Yang, Dheevatsa Mudigere, and Ellie Wen. 2022. DHEN: A Deep and Hierarchical Ensemble Network for Large-Scale Click-Through Rate Prediction. arXiv preprint arXiv:2203.11014 (2022)."}],"event":{"name":"ISCA '23: 50th Annual International Symposium on Computer Architecture","location":"Orlando FL USA","acronym":"ISCA '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IEEE"]},"container-title":["Proceedings of the 50th Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3579371.3589351","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:40Z","timestamp":1750178800000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3579371.3589351"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,6,17]]},"references-count":51,"alternative-id":["10.1145\/3579371.3589351","10.1145\/3579371"],"URL":"https:\/\/doi.org\/10.1145\/3579371.3589351","relation":{},"subject":[],"published":{"date-parts":[[2023,6,17]]},"assertion":[{"value":"2023-06-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}