{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,10]],"date-time":"2026-06-10T16:43:55Z","timestamp":1781109835306,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":71,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,6,11]],"date-time":"2022-06-11T00:00:00Z","timestamp":1654905600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,6,18]]},"DOI":"10.1145\/3470496.3533727","type":"proceedings-article","created":{"date-parts":[[2022,5,31]],"date-time":"2022-05-31T19:06:01Z","timestamp":1654023961000},"page":"993-1011","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":109,"title":["Software-hardware co-design for fast and scalable training of deep learning recommendation models"],"prefix":"10.1145","author":[{"given":"Dheevatsa","family":"Mudigere","sequence":"first","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuchen","family":"Hao","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jianyu","family":"Huang","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhihao","family":"Jia","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Andrew","family":"Tulloch","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Srinivas","family":"Sridharan","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xing","family":"Liu","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mustafa","family":"Ozdal","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jade","family":"Nie","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jongsoo","family":"Park","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Liang","family":"Luo","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jie (Amy)","family":"Yang","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Leon","family":"Gao","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dmytro","family":"Ivchenko","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Aarti","family":"Basant","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yuxi","family":"Hu","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiyan","family":"Yang","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ehsan K.","family":"Ardestani","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Xiaodong","family":"Wang","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rakesh","family":"Komuravelli","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ching-Hsiang","family":"Chu","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Serhat","family":"Yilmaz","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Huayu","family":"Li","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jiyuan","family":"Qian","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhuobo","family":"Feng","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Yinbin","family":"Ma","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Junjie","family":"Yang","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ellie","family":"Wen","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Hong","family":"Li","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lin","family":"Yang","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Chonglin","family":"Sun","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Whitney","family":"Zhao","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dimitry","family":"Melts","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Krishna","family":"Dhulipala","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"KR","family":"Kishore","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Tyler","family":"Graf","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Assaf","family":"Eisenman","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kiran Kumar","family":"Matam","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Adi","family":"Gangidi","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guoqiang Jerry","family":"Chen","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Manoj","family":"Krishnan","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Avinash","family":"Nayak","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Krishnakumar","family":"Nair","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bharath","family":"Muthiah","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mahmoud","family":"khorashadi","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Pallab","family":"Bhattacharya","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Petr","family":"Lapukhov","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Maxim","family":"Naumov","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ajit","family":"Mathews","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Lin","family":"Qiao","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mikhail","family":"Smelyanskiy","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bill","family":"Jia","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Vijay","family":"Rao","sequence":"additional","affiliation":[{"name":"Meta Platforms"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2022,6,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n.d.]. NVIDIA Collective Communications Library (NCCL) https:\/\/developer.nvidia.com\/nccl.  [n.d.]. NVIDIA Collective Communications Library (NCCL) https:\/\/developer.nvidia.com\/nccl."},{"key":"e_1_3_2_1_2_1","unstructured":"[n.d.]. OCP Open rack standard (v2) https:\/\/www.opencompute.org\/wiki\/Open_Rack\/SpecsAndDesigns#RACK_Standards.  [n.d.]. OCP Open rack standard (v2) https:\/\/www.opencompute.org\/wiki\/Open_Rack\/SpecsAndDesigns#RACK_Standards."},{"key":"e_1_3_2_1_3_1","unstructured":"Mart\u00edn Abadi Ashish Agarwal Paul Barham Eugene Brevdo Zhifeng Chen Craig Citro Greg S. Corrado Andy Davis Jeffrey Dean Matthieu Devin Sanjay Ghemawat Ian Goodfellow Andrew Harp Geoffrey Irving Michael Isard Yangqing Jia Rafal Jozefowicz Lukasz Kaiser Manjunath Kudlur Josh Levenberg Dandelion Man\u00e9 Rajat Monga Sherry Moore Derek Murray Chris Olah Mike Schuster Jonathon Shlens Benoit Steiner Ilya Sutskever Kunal Talwar Paul Tucker Vincent Vanhoucke Vijay Vasudevan Fernanda Vi\u00e9gas Oriol Vinyals Pete Warden Martin Wattenberg Martin Wicke Yuan Yu and Xiaoqiang Zheng. 2015. TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems. https:\/\/www.tensorflow.org\/ Software available from tensorflow.org.  Mart\u00edn Abadi Ashish Agarwal Paul Barham Eugene Brevdo Zhifeng Chen Craig Citro Greg S. Corrado Andy Davis Jeffrey Dean Matthieu Devin Sanjay Ghemawat Ian Goodfellow Andrew Harp Geoffrey Irving Michael Isard Yangqing Jia Rafal Jozefowicz Lukasz Kaiser Manjunath Kudlur Josh Levenberg Dandelion Man\u00e9 Rajat Monga Sherry Moore Derek Murray Chris Olah Mike Schuster Jonathon Shlens Benoit Steiner Ilya Sutskever Kunal Talwar Paul Tucker Vincent Vanhoucke Vijay Vasudevan Fernanda Vi\u00e9gas Oriol Vinyals Pete Warden Martin Wattenberg Martin Wicke Yuan Yu and Xiaoqiang Zheng. 2015. TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems. https:\/\/www.tensorflow.org\/ Software available from tensorflow.org."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"crossref","unstructured":"Bilge Acun Matthew Murphy Xiaodong Wang Jade Nie Carole-Jean Wu and Kim Hazelwood. 2020. Understanding Training Efficiency of Deep Learning Recommendation Models at Scale. arXiv:2011.05497 [cs.AR]  Bilge Acun Matthew Murphy Xiaodong Wang Jade Nie Carole-Jean Wu and Kim Hazelwood. 2020. Understanding Training Efficiency of Deep Learning Recommendation Models at Scale. arXiv:2011.05497 [cs.AR]","DOI":"10.1109\/HPCA51647.2021.00072"},{"key":"e_1_3_2_1_5_1","unstructured":"Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arXiv:2005.14165 [cs.CL]  Tom B. Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry Amanda Askell Sandhini Agarwal Ariel Herbert-Voss Gretchen Krueger Tom Henighan Rewon Child Aditya Ramesh Daniel M. Ziegler Jeffrey Wu Clemens Winter Christopher Hesse Mark Chen Eric Sigler Mateusz Litwin Scott Gray Benjamin Chess Jack Clark Christopher Berner Sam McCandlish Alec Radford Ilya Sutskever and Dario Amodei. 2020. Language Models are Few-Shot Learners. arXiv:2005.14165 [cs.CL]"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441620"},{"key":"e_1_3_2_1_7_1","volume-title":"Wide and Deep Learning for Recommender Systems. arXiv:1606.07792","author":"Cheng Heng-Tze","year":"2016","unstructured":"Heng-Tze Cheng , Levent Koc , Jeremiah Harmsen , Tal Shaked , Tushar Chandra , Hrishi Aradhye , Glen Anderson , Greg Corrado , Wei Chai , Mustafa Ispir , Rohan Anil , Zakaria Haque , Lichan Hong , Vihan Jain , Xiaobing Liu , and Hemal Shah . 2016. Wide and Deep Learning for Recommender Systems. arXiv:1606.07792 ( 2016 ). http:\/\/arxiv.org\/abs\/1606.07792 Heng-Tze Cheng, Levent Koc, Jeremiah Harmsen, Tal Shaked, Tushar Chandra, Hrishi Aradhye, Glen Anderson, Greg Corrado, Wei Chai, Mustafa Ispir, Rohan Anil, Zakaria Haque, Lichan Hong, Vihan Jain, Xiaobing Liu, and Hemal Shah. 2016. Wide and Deep Learning for Recommender Systems. arXiv:1606.07792 (2016). http:\/\/arxiv.org\/abs\/1606.07792"},{"key":"e_1_3_2_1_8_1","volume-title":"Xception: Deep Learning with Depthwise Separable Convolutions. arXiv:1610.02357 [cs.CV]","author":"Chollet Fran\u00e7ois","year":"2017","unstructured":"Fran\u00e7ois Chollet . 2017 . Xception: Deep Learning with Depthwise Separable Convolutions. arXiv:1610.02357 [cs.CV] Fran\u00e7ois Chollet. 2017. Xception: Deep Learning with Depthwise Separable Convolutions. arXiv:1610.02357 [cs.CV]"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959190"},{"key":"e_1_3_2_1_10_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL]","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin , Ming-Wei Chang , Kenton Lee , and Kristina Toutanova . 2019 . BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL] Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 [cs.CL]"},{"key":"e_1_3_2_1_11_1","article-title":"Adaptive subgradient methods for online learning and stochastic optimization","volume":"12","author":"Duchi John","year":"2011","unstructured":"John Duchi , Elad Hazan , and Yoram Singer . 2011 . Adaptive subgradient methods for online learning and stochastic optimization . Journal of machine learning research 12 , 7 (2011). John Duchi, Elad Hazan, and Yoram Singer. 2011. Adaptive subgradient methods for online learning and stochastic optimization. Journal of machine learning research 12, 7 (2011).","journal-title":"Journal of machine learning research"},{"key":"e_1_3_2_1_12_1","volume-title":"Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Murali Annavaram, Krishnakumar Nair, and Misha Smelyanskiy.","author":"Eisenman Assaf","year":"2020","unstructured":"Assaf Eisenman , Kiran Kumar Matam , Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Murali Annavaram, Krishnakumar Nair, and Misha Smelyanskiy. 2020 . Check-N-Run: A Checkpointing System for Training Recommendation Models . arXiv:2010.08679 [cs.IR] Assaf Eisenman, Kiran Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Murali Annavaram, Krishnakumar Nair, and Misha Smelyanskiy. 2020. Check-N-Run: A Checkpointing System for Training Recommendation Models. arXiv:2010.08679 [cs.IR]"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2843948"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.5555\/2627435.2638582"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Aaron Harlap Deepak Narayanan Amar Phanishayee Vivek Seshadri Nikhil Devanur Greg Ganger and Phil Gibbons. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. arXiv:1806.03377 [cs.DC]  Aaron Harlap Deepak Narayanan Amar Phanishayee Vivek Seshadri Nikhil Devanur Greg Ganger and Phil Gibbons. 2018. PipeDream: Fast and Efficient Pipeline Parallel DNN Training. arXiv:1806.03377 [cs.DC]","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"e_1_3_2_1_18_1","unstructured":"Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv:1512.03385 [cs.CV]  Kaiming He Xiangyu Zhang Shaoqing Ren and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv:1512.03385 [cs.CV]"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052569"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/2648584.2648589"},{"key":"e_1_3_2_1_21_1","volume-title":"Checkmate: Breaking the memory wall with optimal tensor rematerialization. arXiv preprint arXiv:1910.02653","author":"Jain Paras","year":"2019","unstructured":"Paras Jain , Ajay Jain , Aniruddha Nrusimha , Amir Gholami , Pieter Abbeel , Kurt Keutzer , Ion Stoica , and Joseph E Gonzalez . 2019 . Checkmate: Breaking the memory wall with optimal tensor rematerialization. arXiv preprint arXiv:1910.02653 (2019). Paras Jain, Ajay Jain, Aniruddha Nrusimha, Amir Gholami, Pieter Abbeel, Kurt Keutzer, Ion Stoica, and Joseph E Gonzalez. 2019. Checkmate: Breaking the memory wall with optimal tensor rematerialization. arXiv preprint arXiv:1910.02653 (2019)."},{"key":"e_1_3_2_1_22_1","volume-title":"Beyond data and model parallelism for deep neural networks. arXiv preprint arXiv:1807.05358","author":"Jia Zhihao","year":"2018","unstructured":"Zhihao Jia , Matei Zaharia , and Alex Aiken . 2018. Beyond data and model parallelism for deep neural networks. arXiv preprint arXiv:1807.05358 ( 2018 ). Zhihao Jia, Matei Zaharia, and Alex Aiken. 2018. Beyond data and model parallelism for deep neural networks. arXiv preprint arXiv:1807.05358 (2018)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3326937.3341255"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433758"},{"key":"e_1_3_2_1_26_1","volume-title":"Karp","author":"Karmarker Narenda","year":"1983","unstructured":"Narenda Karmarker and Richard M . Karp . 1983 . The Differencing Method of Set Partitioning. Technical Report. USA. Narenda Karmarker and Richard M. Karp. 1983. The Differencing Method of Set Partitioning. Technical Report. USA."},{"key":"e_1_3_2_1_27_1","volume-title":"Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980","author":"Kingma Diederik P","year":"2014","unstructured":"Diederik P Kingma and Jimmy Ba . 2014 . Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014). Diederik P Kingma and Jimmy Ba. 2014. Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_28_1","volume-title":"Dynamic tensor rematerialization. arXiv preprint arXiv:2006.09616","author":"Kirisame Marisa","year":"2020","unstructured":"Marisa Kirisame , Steven Lyubomirsky , Altan Haan , Jennifer Brennan , Mike He , Jared Roesch , Tianqi Chen , and Zachary Tatlock . 2020. Dynamic tensor rematerialization. arXiv preprint arXiv:2006.09616 ( 2020 ). Marisa Kirisame, Steven Lyubomirsky, Altan Haan, Jennifer Brennan, Mike He, Jared Roesch, Tianqi Chen, and Zachary Tatlock. 2020. Dynamic tensor rematerialization. arXiv preprint arXiv:2006.09616 (2020)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2009.263"},{"key":"e_1_3_2_1_30_1","unstructured":"ChonLam Lao Yanfang Le Kshiteej Mahajan Yixi Chen Wenfei Wu Aditya Akella and Michael Swift. [n.d.]. ATP: In-network Aggregation for Multi-tenant Learning. ([n. d.]).  ChonLam Lao Yanfang Le Kshiteej Mahajan Yixi Chen Wenfei Wu Aditya Akella and Michael Swift. [n.d.]. ATP: In-network Aggregation for Multi-tenant Learning. ([n. d.])."},{"key":"e_1_3_2_1_31_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin , HyoukJoong Lee , Yuanzhong Xu , Dehao Chen , Orhan Firat , Yanping Huang , Maxim Krikun , Noam Shazeer , and Zhifeng Chen . 2020 . Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020). Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. 2020. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668 (2020)."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.14778\/3415478.3415530"},{"key":"e_1_3_2_1_33_1","volume-title":"3LC: Lightweight and Effective Traffic Compression for Distributed Machine Learning. arXiv preprint arXiv:1802.07389","author":"Lim Hyeontaek","year":"2018","unstructured":"Hyeontaek Lim , David G Andersen , and Michael Kaminsky . 2018. 3LC: Lightweight and Effective Traffic Compression for Distributed Machine Learning. arXiv preprint arXiv:1802.07389 ( 2018 ). Hyeontaek Lim, David G Andersen, and Michael Kaminsky. 2018. 3LC: Lightweight and Effective Traffic Compression for Distributed Machine Learning. arXiv preprint arXiv:1802.07389 (2018)."},{"key":"e_1_3_2_1_34_1","volume-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887","author":"Lin Yujun","year":"2017","unstructured":"Yujun Lin , Song Han , Huizi Mao , Yu Wang , and William J Dally . 2017. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887 ( 2017 ). Yujun Lin, Song Han, Huizi Mao, Yu Wang, and William J Dally. 2017. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887 (2017)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i10.17058"},{"key":"e_1_3_2_1_36_1","unstructured":"Liang Luo Peter West Arvind Krishnamurthy Luis Ceze and Jacob Nelson. 2020. PLink: Discovering and Exploiting Datacenter Network Locality for Efficient Cloud-based Distributed Training.  Liang Luo Peter West Arvind Krishnamurthy Luis Ceze and Jacob Nelson. 2020. PLink: Discovering and Exploiting Datacenter Network Locality for Efficient Cloud-based Distributed Training."},{"key":"e_1_3_2_1_37_1","volume-title":"Temporal-Contextual Recommendation in Real-Time (KDD '20)","author":"Ma Yifei","unstructured":"Yifei Ma , Balakrishnan (Murali) Narayanaswamy , Haibin Lin , and Hao Ding . 2020. Temporal-Contextual Recommendation in Real-Time (KDD '20) . Association for Computing Machinery , New York, NY, USA , 2291--2299. Yifei Ma, Balakrishnan (Murali) Narayanaswamy, Haibin Lin, and Hao Ding. 2020. Temporal-Contextual Recommendation in Real-Time (KDD '20). Association for Computing Machinery, New York, NY, USA, 2291--2299."},{"key":"e_1_3_2_1_38_1","volume-title":"Taylor Robie, Tom St. John, Tsuguchika Tabaru, Carole-Jean Wu, Lingjie Xu, Masafumi Yamazaki, Cliff Young, and Matei Zaharia.","author":"Mattson Peter","year":"2020","unstructured":"Peter Mattson , Christine Cheng , Cody Coleman , Greg Diamos , Paulius Micikevicius , David Patterson , Hanlin Tang , Gu-Yeon Wei , Peter Bailis , Victor Bittorf , David Brooks , Dehao Chen , Debojyoti Dutta , Udit Gupta , Kim Hazelwood , Andrew Hock , Xinyuan Huang , Atsushi Ike , Bill Jia , Daniel Kang , David Kanter , Naveen Kumar , Jeffery Liao , Guokai Ma , Deepak Narayanan , Tayo Oguntebi , Gennady Pekhimenko , Lillian Pentecost , Vijay Janapa Reddi , Taylor Robie, Tom St. John, Tsuguchika Tabaru, Carole-Jean Wu, Lingjie Xu, Masafumi Yamazaki, Cliff Young, and Matei Zaharia. 2020 . MLPerf Training Benchmark . arXiv:1910.01500 [cs.LG] Peter Mattson, Christine Cheng, Cody Coleman, Greg Diamos, Paulius Micikevicius, David Patterson, Hanlin Tang, Gu-Yeon Wei, Peter Bailis, Victor Bittorf, David Brooks, Dehao Chen, Debojyoti Dutta, Udit Gupta, Kim Hazelwood, Andrew Hock, Xinyuan Huang, Atsushi Ike, Bill Jia, Daniel Kang, David Kanter, Naveen Kumar, Jeffery Liao, Guokai Ma, Deepak Narayanan, Tayo Oguntebi, Gennady Pekhimenko, Lillian Pentecost, Vijay Janapa Reddi, Taylor Robie, Tom St. John, Tsuguchika Tabaru, Carole-Jean Wu, Lingjie Xu, Masafumi Yamazaki, Cliff Young, and Matei Zaharia. 2020. MLPerf Training Benchmark. arXiv:1910.01500 [cs.LG]"},{"key":"e_1_3_2_1_39_1","unstructured":"Azalia Mirhoseini Hieu Pham Quoc Le Mohammad Norouzi Samy Bengio Benoit Steiner Yuefeng Zhou Naveen Kumar Rasmus Larsen and Jeff Dean. 2017. Device Placement Optimization with Reinforcement Learning. https:\/\/arxiv.org\/abs\/1706.04972  Azalia Mirhoseini Hieu Pham Quoc Le Mohammad Norouzi Samy Bengio Benoit Steiner Yuefeng Zhou Naveen Kumar Rasmus Larsen and Jeff Dean. 2017. Device Placement Optimization with Reinforcement Learning. https:\/\/arxiv.org\/abs\/1706.04972"},{"key":"e_1_3_2_1_40_1","volume-title":"2019 OCP Regional Summit","author":"Mudigere Dheevatsa","year":"2019","unstructured":"Dheevatsa Mudigere and Whitney Zhao . 2019. HW\/ SW Co-design for future AI platforms - Large memory unified training platform (Zion). In 2019 OCP Regional Summit , Amsterdam . https:\/\/ 2019 ocpregionalsummit.sched.com\/event\/Qyge Dheevatsa Mudigere and Whitney Zhao. 2019. HW\/SW Co-design for future AI platforms - Large memory unified training platform (Zion). In 2019 OCP Regional Summit, Amsterdam. https:\/\/2019ocpregionalsummit.sched.com\/event\/Qyge"},{"key":"e_1_3_2_1_41_1","volume-title":"Efficient Large-Scale Language Model Training on GPU Clusters. CoRR abs\/2104.04473","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan , Mohammad Shoeybi , Jared Casper , Patrick LeGresley , Mostofa Patwary , Vijay Korthikanti , Dmitri Vainbrand , Prethvi Kashinkunti , Julie Bernauer , Bryan Catanzaro , Amar Phanishayee , and Matei Zaharia . 2021. Efficient Large-Scale Language Model Training on GPU Clusters. CoRR abs\/2104.04473 ( 2021 ). arXiv:2104.04473 https:\/\/arxiv.org\/abs\/2104.04473 Deepak Narayanan, Mohammad Shoeybi, Jared Casper, Patrick LeGresley, Mostofa Patwary, Vijay Korthikanti, Dmitri Vainbrand, Prethvi Kashinkunti, Julie Bernauer, Bryan Catanzaro, Amar Phanishayee, and Matei Zaharia. 2021. Efficient Large-Scale Language Model Training on GPU Clusters. CoRR abs\/2104.04473 (2021). arXiv:2104.04473 https:\/\/arxiv.org\/abs\/2104.04473"},{"key":"e_1_3_2_1_42_1","unstructured":"Maxim Naumov John Kim Dheevatsa Mudigere Srinivas Sridharan Xiaodong Wang Whitney Zhao Serhat Yilmaz Changkyu Kim Hector Yuen Mustafa Ozdal Krishnakumar Nair Isabel Gao Bor-Yiing Su Jiyan Yang and Mikhail Smelyanskiy. 2020. Deep Learning Training in Facebook Data Centers: Design of Scale-up and Scale-out Systems. arXiv:2003.09518 [cs.DC]  Maxim Naumov John Kim Dheevatsa Mudigere Srinivas Sridharan Xiaodong Wang Whitney Zhao Serhat Yilmaz Changkyu Kim Hector Yuen Mustafa Ozdal Krishnakumar Nair Isabel Gao Bor-Yiing Su Jiyan Yang and Mikhail Smelyanskiy. 2020. Deep Learning Training in Facebook Data Centers: Design of Scale-up and Scale-out Systems. arXiv:2003.09518 [cs.DC]"},{"key":"e_1_3_2_1_43_1","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. CoRR abs\/1906.00091 (2019). https:\/\/arxiv.org\/abs\/1906.00091  Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. CoRR abs\/1906.00091 (2019). https:\/\/arxiv.org\/abs\/1906.00091"},{"key":"e_1_3_2_1_44_1","unstructured":"NVIDIA. 2021. Unified Memory in CUDA 6. https:\/\/developer.nvidia.com\/blog\/unified-memory-in-cuda-6\/. Accessed: 2021-03-31.  NVIDIA. 2021. Unified Memory in CUDA 6. https:\/\/developer.nvidia.com\/blog\/unified-memory-in-cuda-6\/. Accessed: 2021-03-31."},{"key":"e_1_3_2_1_45_1","unstructured":"OpenAI. 2018. AI and Compute. https:\/\/openai.com\/blog\/ai-and-compute\/.  OpenAI. 2018. AI and Compute. https:\/\/openai.com\/blog\/ai-and-compute\/."},{"key":"e_1_3_2_1_46_1","volume-title":"19th USENIX Conference on File and Storage Technologies (FAST 21)","author":"Pan Satadru","year":"2021","unstructured":"Satadru Pan , Theano Stavrinos , Yunqiao Zhang , Atul Sikaria , Pavel Zakharov , Abhinav Sharma , Shiva Shankar P, Mike Shuey , Richard Wareing , Monika Gangapuram , Guanglei Cao , Christian Preseau , Pratap Singh , Kestutis Patiejunas , JR Tipton , Ethan Katz-Bassett , and Wyatt Lloyd . 2021 . Facebook's Tectonic Filesystem: Efficiency from Exascale . In 19th USENIX Conference on File and Storage Technologies (FAST 21) . USENIX Association, 217--231. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/pan Satadru Pan, Theano Stavrinos, Yunqiao Zhang, Atul Sikaria, Pavel Zakharov, Abhinav Sharma, Shiva Shankar P, Mike Shuey, Richard Wareing, Monika Gangapuram, Guanglei Cao, Christian Preseau, Pratap Singh, Kestutis Patiejunas, JR Tipton, Ethan Katz-Bassett, and Wyatt Lloyd. 2021. Facebook's Tectonic Filesystem: Efficiency from Exascale. In 19th USENIX Conference on File and Storage Technologies (FAST 21). USENIX Association, 217--231. https:\/\/www.usenix.org\/conference\/fast21\/presentation\/pan"},{"key":"e_1_3_2_1_47_1","unstructured":"Jongsoo Park Maxim Naumov Protonu Basu Summer Deng Aravind Kalaiah Daya Khudia James Law Parth Malani Andrey Malevich Satish Nadathur Juan Pino Martin Schatz Alexander Sidorov Viswanath Sivakumar Andrew Tulloch Xiaodong Wang Yiming Wu Hector Yuen Utku Diril Dmytro Dzhulgakov Kim Hazelwood Bill Jia Yangqing Jia Lin Qiao Vijay Rao Nadav Rotem Sungjoo Yoo and Mikhail Smelyanskiy. 2018. Deep Learning Inference in Facebook Data Centers: Characterization Performance Optimizations and Hardware Implications. arXiv:1811.09886 [cs.LG]  Jongsoo Park Maxim Naumov Protonu Basu Summer Deng Aravind Kalaiah Daya Khudia James Law Parth Malani Andrey Malevich Satish Nadathur Juan Pino Martin Schatz Alexander Sidorov Viswanath Sivakumar Andrew Tulloch Xiaodong Wang Yiming Wu Hector Yuen Utku Diril Dmytro Dzhulgakov Kim Hazelwood Bill Jia Yangqing Jia Lin Qiao Vijay Rao Nadav Rotem Sungjoo Yoo and Mikhail Smelyanskiy. 2018. Deep Learning Inference in Facebook Data Centers: Characterization Performance Optimizations and Hardware Implications. arXiv:1811.09886 [cs.LG]"},{"key":"e_1_3_2_1_48_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. 8026--8037.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke , Sam Gross , Francisco Massa , Adam Lerer , James Bradbury , Gregory Chanan , Trevor Killeen , Zeming Lin , Natalia Gimelshein , Luca Antiga , 2019 . Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. 8026--8037. Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. 2019. Pytorch: An imperative style, high-performance deep learning library. In Advances in neural information processing systems. 8026--8037."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359642"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_51_1","volume-title":"A lock-free approach to parallelizing stochastic gradient descent. Advances in neural information processing systems 24","author":"Recht Benjamin","year":"2011","unstructured":"Benjamin Recht , Christopher Re , Stephen Wright , and Feng Niu . 2011. Hogwild! : A lock-free approach to parallelizing stochastic gradient descent. Advances in neural information processing systems 24 ( 2011 ), 693--701. Benjamin Recht, Christopher Re, Stephen Wright, and Feng Niu. 2011. Hogwild!: A lock-free approach to parallelizing stochastic gradient descent. Advances in neural information processing systems 24 (2011), 693--701."},{"key":"e_1_3_2_1_52_1","volume-title":"MLPerf Inference Benchmark. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). 446--459","author":"Reddi V. J.","unstructured":"V. J. Reddi , C. Cheng , D. Kanter , P. Mattson , G. Schmuelling , C. Wu , B. Anderson , M. Breughe , M. Charlebois , W. Chou , R. Chukka , C. Coleman , S. Davis , P. Deng , G. Diamos , J. Duke , D. Fick , J. S. Gardner , I. Hubara , S. Idgunji , T. B. Jablin , J. Jiao , T. S. John , P. Kanwar , D. Lee , J. Liao , A. Lokhmotov , F. Massa , P. Meng , P. Micikevicius , C. Osborne , G. Pekhimenko , A. T. R. Rajan , D. Sequeira , A. Sirasao , F. Sun , H. Tang , M. Thomson , F. Wei , E. Wu , L. Xu , K. Yamada , B. Yu , G. Yuan , A. Zhong , P. Zhang , and Y. Zhou . 2020 . MLPerf Inference Benchmark. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). 446--459 . V. J. Reddi, C. Cheng, D. Kanter, P. Mattson, G. Schmuelling, C. Wu, B. Anderson, M. Breughe, M. Charlebois, W. Chou, R. Chukka, C. Coleman, S. Davis, P. Deng, G. Diamos, J. Duke, D. Fick, J. S. Gardner, I. Hubara, S. Idgunji, T. B. Jablin, J. Jiao, T. S. John, P. Kanwar, D. Lee, J. Liao, A. Lokhmotov, F. Massa, P. Meng, P. Micikevicius, C. Osborne, G. Pekhimenko, A. T. R. Rajan, D. Sequeira, A. Sirasao, F. Sun, H. Tang, M. Thomson, F. Wei, E. Wu, L. Xu, K. Yamada, B. Yu, G. Yuan, A. Zhong, P. Zhang, and Y. Zhou. 2020. MLPerf Inference Benchmark. In 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA). 446--459."},{"key":"e_1_3_2_1_53_1","volume-title":"Dan RK Ports, and Peter Richt\u00e1rik","author":"Sapio Amedeo","year":"2019","unstructured":"Amedeo Sapio , Marco Canini , Chen-Yu Ho , Jacob Nelson , Panos Kalnis , Changhoon Kim , Arvind Krishnamurthy , Masoud Moshref , Dan RK Ports, and Peter Richt\u00e1rik . 2019 . Scaling distributed machine learning with in-network aggregation. arXiv preprint arXiv:1903.06701 (2019). Amedeo Sapio, Marco Canini, Chen-Yu Ho, Jacob Nelson, Panos Kalnis, Changhoon Kim, Arvind Krishnamurthy, Masoud Moshref, Dan RK Ports, and Peter Richt\u00e1rik. 2019. Scaling distributed machine learning with in-network aggregation. arXiv preprint arXiv:1903.06701 (2019)."},{"key":"e_1_3_2_1_54_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso . 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799 ( 2018 ). Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799 (2018)."},{"key":"e_1_3_2_1_55_1","unstructured":"David Silver Thomas Hubert Julian Schrittwieser Ioannis Antonoglou Matthew Lai Arthur Guez Marc Lanctot Laurent Sifre Dharshan Kumaran Thore Graepel Timothy Lillicrap Karen Simonyan and Demis Hassabis. 2017. Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm. arXiv:1712.01815 [cs.AI]  David Silver Thomas Hubert Julian Schrittwieser Ioannis Antonoglou Matthew Lai Arthur Guez Marc Lanctot Laurent Sifre Dharshan Kumaran Thore Graepel Timothy Lillicrap Karen Simonyan and Demis Hassabis. 2017. Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm. arXiv:1712.01815 [cs.AI]"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1038\/nature24270"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875650"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIC.2017.72"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"crossref","unstructured":"Christian Szegedy Wei Liu Yangqing Jia Pierre Sermanet Scott Reed Dragomir Anguelov Dumitru Erhan Vincent Vanhoucke and Andrew Rabinovich. 2014. Going Deeper with Convolutions. arXiv:1409.4842 [cs.CV]  Christian Szegedy Wei Liu Yangqing Jia Pierre Sermanet Scott Reed Dragomir Anguelov Dumitru Erhan Vincent Vanhoucke and Andrew Rabinovich. 2014. Going Deeper with Convolutions. arXiv:1409.4842 [cs.CV]","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_2_1_60_1","volume-title":"Efficient algorithms for device placement of dnn graph operators. arXiv preprint arXiv:2006.16423","author":"Tarnawski Jakub","year":"2020","unstructured":"Jakub Tarnawski , Amar Phanishayee , Nikhil R Devanur , Divya Mahajan , and Fanny Nina Paravecino . 2020. Efficient algorithms for device placement of dnn graph operators. arXiv preprint arXiv:2006.16423 ( 2020 ). Jakub Tarnawski, Amar Phanishayee, Nikhil R Devanur, Divya Mahajan, and Fanny Nina Paravecino. 2020. Efficient algorithms for device placement of dnn graph operators. arXiv preprint arXiv:2006.16423 (2020)."},{"key":"e_1_3_2_1_61_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. arXiv:1706.03762 [cs.CL]  Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan N. Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention Is All You Need. arXiv:1706.03762 [cs.CL]"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00025"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.2010.11305"},{"key":"e_1_3_2_1_64_1","volume-title":"Ping Tak Peter Tang, and Andrew Tul- loch","author":"Yang Jie Amy","year":"2020","unstructured":"Jie Amy Yang , Jianyu Huang , Jongsoo Park , Ping Tak Peter Tang, and Andrew Tul- loch . 2020 . Mixed-Precision Embedding Using a Cache . arXiv:2010.11305 [cs.LG] Jie Amy Yang, Jianyu Huang, Jongsoo Park, Ping Tak Peter Tang, and Andrew Tul- loch. 2020. Mixed-Precision Embedding Using a Cache. arXiv:2010.11305 [cs.LG]"},{"key":"e_1_3_2_1_65_1","unstructured":"Jie Amy Yang Jongsoo Park Srinivas Sridharan and Ping Tak Peter Tang. 2020. Training Deep Learning Recommendation Model with Quantized Collective Communications. (2020).  Jie Amy Yang Jongsoo Park Srinivas Sridharan and Ping Tak Peter Tang. 2020. Training Deep Learning Recommendation Model with Quantized Collective Communications. (2020)."},{"key":"e_1_3_2_1_66_1","volume-title":"Large batch optimization for deep learning: Training bert in 76 minutes. arXiv preprint arXiv:1904.00962","author":"You Yang","year":"2019","unstructured":"Yang You , Jing Li , Sashank Reddi , Jonathan Hseu , Sanjiv Kumar , Srinadh Bhojanapalli , Xiaodan Song , James Demmel , Kurt Keutzer , and Cho-Jui Hsieh . 2019. Large batch optimization for deep learning: Training bert in 76 minutes. arXiv preprint arXiv:1904.00962 ( 2019 ). Yang You, Jing Li, Sashank Reddi, Jonathan Hseu, Sanjiv Kumar, Srinadh Bhojanapalli, Xiaodan Song, James Demmel, Kurt Keutzer, and Cho-Jui Hsieh. 2019. Large batch optimization for deep learning: Training bert in 76 minutes. arXiv preprint arXiv:1904.00962 (2019)."},{"key":"e_1_3_2_1_67_1","volume-title":"Systems for Machine Learning Workshop at NeurIPS","volume":"2018","author":"Zhang Jian","year":"2018","unstructured":"Jian Zhang , Jiyan Yang , and Hector Yuen . 2018 . Training with low-precision embedding tables . In Systems for Machine Learning Workshop at NeurIPS , Vol. 2018 . Jian Zhang, Jiyan Yang, and Hector Yuen. 2018. Training with low-precision embedding tables. In Systems for Machine Learning Workshop at NeurIPS, Vol. 2018."},{"key":"e_1_3_2_1_68_1","volume-title":"Deep learning with elastic averaging SGD. Advances in neural information processing systems 28","author":"Zhang Sixin","year":"2015","unstructured":"Sixin Zhang , Anna E Choromanska , and Yann LeCun . 2015. Deep learning with elastic averaging SGD. Advances in neural information processing systems 28 ( 2015 ), 685--693. Sixin Zhang, Anna E Choromanska, and Yann LeCun. 2015. Deep learning with elastic averaging SGD. Advances in neural information processing systems 28 (2015), 685--693."},{"key":"e_1_3_2_1_69_1","volume-title":"Accelerator Fabric in Facebook Zion Training System. In 2019 IEEE\/ACM International Symposium on Networks-on-Chip (NOCS).","author":"Zhao Whiteny","year":"2019","unstructured":"Whiteny Zhao , Dheevatsa Mudigere , Xiaodong Wang , Jongsoo Park , John Kim , and Mikhail Smelyanskiy . 2019 . Accelerator Fabric in Facebook Zion Training System. In 2019 IEEE\/ACM International Symposium on Networks-on-Chip (NOCS). Whiteny Zhao, Dheevatsa Mudigere, Xiaodong Wang, Jongsoo Park, John Kim, and Mikhail Smelyanskiy. 2019. Accelerator Fabric in Facebook Zion Training System. In 2019 IEEE\/ACM International Symposium on Networks-on-Chip (NOCS)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3358045"},{"key":"e_1_3_2_1_71_1","volume-title":"ShadowSync: Performing Synchronization in the Background for Highly Scalable Distributed Training. CoRR","author":"Zheng Qinqing","year":"2003","unstructured":"Qinqing Zheng , Bor-Yiing Su , Jiyan Yang , Alisson Azzolini , Qiang Wu , Ou Jin , Shri Karandikar , Hagay Lupesko , Liang Xiong , and Eric Zhou . 2020. ShadowSync: Performing Synchronization in the Background for Highly Scalable Distributed Training. CoRR 2003 .03477 (2020). Qinqing Zheng, Bor-Yiing Su, Jiyan Yang, Alisson Azzolini, Qiang Wu, Ou Jin, Shri Karandikar, Hagay Lupesko, Liang Xiong, and Eric Zhou. 2020. ShadowSync: Performing Synchronization in the Background for Highly Scalable Distributed Training. CoRR 2003.03477 (2020)."}],"event":{"name":"ISCA '22: The 49th Annual International Symposium on Computer Architecture","location":"New York New York","acronym":"ISCA '22","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","IEEE CS TCAA IEEE CS technical committee on architectural acoustics"]},"container-title":["Proceedings of the 49th Annual International Symposium on Computer Architecture"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3470496.3533727","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3470496.3533727","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:18:54Z","timestamp":1750191534000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3470496.3533727"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,6,11]]},"references-count":71,"alternative-id":["10.1145\/3470496.3533727","10.1145\/3470496"],"URL":"https:\/\/doi.org\/10.1145\/3470496.3533727","relation":{},"subject":[],"published":{"date-parts":[[2022,6,11]]},"assertion":[{"value":"2022-06-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}