{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,6]],"date-time":"2026-06-06T04:04:47Z","timestamp":1780718687127,"version":"3.54.1"},"publisher-location":"New York, NY, USA","reference-count":147,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,8,9]],"date-time":"2021-08-09T00:00:00Z","timestamp":1628467200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"DARPA FastNICs"},{"name":"MachineLearningApplications@CSAIL Award"},{"name":"DARPA PIPES"},{"name":"SystemsThatLearn@CSAIL Ignite Grant"},{"name":"NSF","award":["ASCENT-2023468, CNS-2008624, CNS-1751009, CNS-2006827, CNS-1563826"],"award-info":[{"award-number":["ASCENT-2023468, CNS-2008624, CNS-1751009, CNS-2006827, CNS-1563826"]}]},{"name":"Cisco Research Center Award"},{"name":"AEPA-E ENLITENED PINE"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,8,9]]},"DOI":"10.1145\/3452296.3472900","type":"proceedings-article","created":{"date-parts":[[2021,8,9]],"date-time":"2021-08-09T18:13:15Z","timestamp":1628532795000},"page":"657-675","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":92,"title":["SiP-ML"],"prefix":"10.1145","author":[{"given":"Mehrdad","family":"Khani","sequence":"first","affiliation":[{"name":"MIT"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Manya","family":"Ghobadi","sequence":"additional","affiliation":[{"name":"MIT"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mohammad","family":"Alizadeh","sequence":"additional","affiliation":[{"name":"MIT"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ziyi","family":"Zhu","sequence":"additional","affiliation":[{"name":"Columbia University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Madeleine","family":"Glick","sequence":"additional","affiliation":[{"name":"Columbia University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Keren","family":"Bergman","sequence":"additional","affiliation":[{"name":"Columbia University"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Amin","family":"Vahdat","sequence":"additional","affiliation":[{"name":"Google"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Benjamin","family":"Klenk","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Eiman","family":"Ebrahimi","sequence":"additional","affiliation":[{"name":"NVIDIA"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2021,8,9]]},"reference":[{"key":"e_1_3_2_2_1_1","unstructured":"AI and Compute. https:\/\/openai.com\/blog\/ai-and-compute\/.  AI and Compute. https:\/\/openai.com\/blog\/ai-and-compute\/."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2019.2947013"},{"key":"e_1_3_2_2_3_1","unstructured":"Siddharth Das. CNN Architectures 2017.  Siddharth Das. CNN Architectures 2017."},{"key":"e_1_3_2_2_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_2_5_1","unstructured":"NVIDIA DGX A100. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-a100\/.  NVIDIA DGX A100. https:\/\/www.nvidia.com\/en-us\/data-center\/dgx-a100\/."},{"key":"e_1_3_2_2_6_1","unstructured":"NVIDIA Selene Cluster. https:\/\/blogs.nvidia.com\/blog\/2020\/12\/18\/nvidia-selene-busy\/.  NVIDIA Selene Cluster. https:\/\/blogs.nvidia.com\/blog\/2020\/12\/18\/nvidia-selene-busy\/."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"crossref","unstructured":"S S Vazhkudai B R de Supinski A S Bland A Geist J Sexton J Kahle C J Zimmer S Atchley S H Oral D E Maxwell V G Vergara Larrea A Bertsch R Goldstone W Joubert C Chambreau D Appelhans R Blackmore B Casses G Chochia G Davison M A Ezell E Gonsiorowski L Grinberg B Hanson B Hartner I Karlin M L Leininger D Leverman C Marroquin A Moody M Ohmacht R Pankajakshan F Pizzano J H Rogers B Rosenburg D Schmidt M Shankar F Wang P Watson B Walkup L D Weems and J Yin. The design deployment and evaluation of the coral pre-exascale systems. 7 2018.  S S Vazhkudai B R de Supinski A S Bland A Geist J Sexton J Kahle C J Zimmer S Atchley S H Oral D E Maxwell V G Vergara Larrea A Bertsch R Goldstone W Joubert C Chambreau D Appelhans R Blackmore B Casses G Chochia G Davison M A Ezell E Gonsiorowski L Grinberg B Hanson B Hartner I Karlin M L Leininger D Leverman C Marroquin A Moody M Ohmacht R Pankajakshan F Pizzano J H Rogers B Rosenburg D Schmidt M Shankar F Wang P Watson B Walkup L D Weems and J Yin. The design deployment and evaluation of the coral pre-exascale systems. 7 2018.","DOI":"10.1109\/SC.2018.00055"},{"key":"e_1_3_2_2_8_1","volume-title":"July","author":"Coffey Valerie","year":"2020","unstructured":"Valerie Coffey . DARPA PIPES Program demonstrates 2 Tbit\/s optical interconnects at the chip level , July 2020 . https:\/\/www.laserfocusworld.com\/fiber-optics\/article\/14176186\/darpa-pipes-program-demonstrates-2-tbits-optical-interconnects-at-the-chip-level. Valerie Coffey. DARPA PIPES Program demonstrates 2 Tbit\/s optical interconnects at the chip level, July 2020. https:\/\/www.laserfocusworld.com\/fiber-optics\/article\/14176186\/darpa-pipes-program-demonstrates-2-tbits-optical-interconnects-at-the-chip-level."},{"key":"e_1_3_2_2_9_1","volume-title":"Optical i\/o chiplets eliminate bottlenecks to unleash innovation","author":"Wade Mark","year":"2020","unstructured":"Mark Wade . Optical i\/o chiplets eliminate bottlenecks to unleash innovation , 2020 . https:\/\/ayarlabs.com\/ayar-labs-solving-critical-computing-challenges-through-optical-i-o\/. Mark Wade. Optical i\/o chiplets eliminate bottlenecks to unleash innovation, 2020. https:\/\/ayarlabs.com\/ayar-labs-solving-critical-computing-challenges-through-optical-i-o\/."},{"key":"e_1_3_2_2_10_1","first-page":"1","volume-title":"Silicon Optical Interposers for High-Density Optical Interconnects","author":"Urino Yutaka","year":"2016","unstructured":"Yutaka Urino , Takahiro Nakamura , and Yasuhiko Arakawa . Silicon Optical Interposers for High-Density Optical Interconnects , pages 1 -- 39 . Springer Berlin Heidelberg , Berlin, Heidelberg , 2016 . Yutaka Urino, Takahiro Nakamura, and Yasuhiko Arakawa. Silicon Optical Interposers for High-Density Optical Interconnects, pages 1--39. Springer Berlin Heidelberg, Berlin, Heidelberg, 2016."},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/EPTC.2017.8277464"},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/JLT.2011.2159260"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1088\/2040-8978\/18\/7\/073003"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"crossref","unstructured":"M. Wade M. Davenport M. De Cea Falco P. Bhargava J. Fini D. Van Orden R. Meade E. Yeung R. Ram M. Popovic V. Stojanovic and C. Sun. A bandwidth-dense low power electronic-photonic platform and architecture for multi-tbps optical i\/o. pages 1--3 Sep. 2018.  M. Wade M. Davenport M. De Cea Falco P. Bhargava J. Fini D. Van Orden R. Meade E. Yeung R. Ram M. Popovic V. Stojanovic and C. Sun. A bandwidth-dense low power electronic-photonic platform and architecture for multi-tbps optical i\/o. pages 1--3 Sep. 2018.","DOI":"10.1109\/ECOC.2018.8535563"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2013.1"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1002\/0470014180"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1364\/OPTICA.5.001354"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1364\/OPN.29.3.000036"},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-018-0028-z"},{"key":"e_1_3_2_2_20_1","volume-title":"HotChips","author":"Wade Mark","unstructured":"Mark Wade , Erik Anderson , Shahab Ardalan , Pavan Bhargava , Sidney Buchbinder , Michael Davenport , John Fini , Anatoly Khilo , Chandru Ramamurthy Roy Meade , Michael Rust , Vladimir Stojanovic Forrest Sedgwick , Derek Van Orden , Chong Zhang Edward Wang , Chen Sun , Sergey Shumarayev , Conor O'Keeffe , Tim T. Hoang , David Kehlet , Ravi V. Mahajan , Allen Chan , and Tina Tran . TeraPHY : A Chiplet Technology for Low-Power, High-Bandwidth Optical I\/O . HotChips , pages i--xlviii, August 2019. https:\/\/www.hotchips.org\/hc31\/HC31_2.9_AyarLabs_20190820_HC_FINAL.pdf. Mark Wade, Erik Anderson, Shahab Ardalan, Pavan Bhargava, Sidney Buchbinder, Michael Davenport, John Fini, Anatoly Khilo, Chandru Ramamurthy Roy Meade, Michael Rust, Vladimir Stojanovic Forrest Sedgwick, Derek Van Orden, Chong Zhang Edward Wang, Chen Sun, Sergey Shumarayev, Conor O'Keeffe, Tim T. Hoang, David Kehlet, Ravi V. Mahajan, Allen Chan, and Tina Tran. TeraPHY: A Chiplet Technology for Low-Power, High-Bandwidth Optical I\/O. HotChips, pages i--xlviii, August 2019. https:\/\/www.hotchips.org\/hc31\/HC31_2.9_AyarLabs_20190820_HC_FINAL.pdf."},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1364\/OE.23.004791"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1002\/lpor.201100017"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSTQE.2019.2911421"},{"key":"e_1_3_2_2_24_1","first-page":"339","volume-title":"SIGCOMM'10","author":"Farrington Nathan","unstructured":"Nathan Farrington , George Porter , Sivasankar Radhakrishnan , Hamid Hajabdolali Bazzaz , Vikram Subramanya , Yeshaiahu Fainman , George Papen , and Amin Vahdat . Helios : A hybrid electrical\/optical switch architecture for modular data centers . SIGCOMM'10 , pages 339 -- 350 . Nathan Farrington, George Porter, Sivasankar Radhakrishnan, Hamid Hajabdolali Bazzaz, Vikram Subramanya, Yeshaiahu Fainman, George Papen, and Amin Vahdat. Helios: A hybrid electrical\/optical switch architecture for modular data centers. SIGCOMM'10, pages 339--350."},{"key":"e_1_3_2_2_25_1","first-page":"327","volume-title":"SIGCOMM'10","author":"Wang Guohui","unstructured":"Guohui Wang , David G. Andersen , Michael Kaminsky , Konstantina Papagiannaki , T.S. Eugene Ng , Michael Kozuch , and Michael Ryan . c-Through : Part-time optics in data centers . SIGCOMM'10 , pages 327 -- 338 . Guohui Wang, David G. Andersen, Michael Kaminsky, Konstantina Papagiannaki, T.S. Eugene Ng, Michael Kozuch, and Michael Ryan. c-Through: Part-time optics in data centers. SIGCOMM'10, pages 327--338."},{"key":"e_1_3_2_2_26_1","first-page":"577","volume-title":"14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Chen Li","year":"2017","unstructured":"Li Chen , Kai Chen , Zhonghua Zhu , Minlan Yu , George Porter , Chunming Qiao , and Shan Zhong . Enabling wide-spread communications on optical fabric with megaswitch . In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17) , pages 577 -- 593 , Boston, MA , 2017 . USENIX Association. Li Chen, Kai Chen, Zhonghua Zhu, Minlan Yu, George Porter, Chunming Qiao, and Shan Zhong. Enabling wide-spread communications on optical fabric with megaswitch. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17), pages 577--593, Boston, MA, 2017. USENIX Association."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.5555\/3020948.3021030"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_2_29_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"key":"e_1_3_2_2_30_1","unstructured":"Baidu 2017. https:\/\/github.com\/baidu-research\/baidu-allreduce.  Baidu 2017. https:\/\/github.com\/baidu-research\/baidu-allreduce."},{"key":"e_1_3_2_2_31_1","volume-title":"Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. CoRR, abs\/1807.11205","author":"Jia Xianyan","year":"2018","unstructured":"Xianyan Jia , Shutao Song , Wei He , Yangzihao Wang , Haidong Rong , Feihu Zhou , Liqiang Xie , Zhenyu Guo , Yuanzhou Yang , Liwei Yu , Tiegang Chen , Guangxiao Hu , Shaohuai Shi , and Xiaowen Chu . Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. CoRR, abs\/1807.11205 , 2018 . Xianyan Jia, Shutao Song, Wei He, Yangzihao Wang, Haidong Rong, Feihu Zhou, Liqiang Xie, Zhenyu Guo, Yuanzhou Yang, Liwei Yu, Tiegang Chen, Guangxiao Hu, Shaohuai Shi, and Xiaowen Chu. Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. CoRR, abs\/1807.11205, 2018."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1023\/A:1022643204877"},{"key":"e_1_3_2_2_33_1","first-page":"2834","volume-title":"Advances in Neural Information Processing Systems 27","author":"Lee Seunghak","year":"2014","unstructured":"Seunghak Lee , Jin Kyu Kim , Xun Zheng , Qirong Ho , Garth A Gibson , and Eric P Xing . On model parallelization and scheduling strategies for distributed machine learning. In Z. Ghahramani, M. Welling, C. Cortes, N. D. Lawrence, and K. Q. Weinberger, editors , Advances in Neural Information Processing Systems 27 , pages 2834 -- 2842 . Curran Associates, Inc. , 2014 . Seunghak Lee, Jin Kyu Kim, Xun Zheng, Qirong Ho, Garth A Gibson, and Eric P Xing. On model parallelization and scheduling strategies for distributed machine learning. In Z. Ghahramani, M. Welling, C. Cortes, N. D. Lawrence, and K. Q. Weinberger, editors, Advances in Neural Information Processing Systems 27, pages 2834--2842. Curran Associates, Inc., 2014."},{"key":"e_1_3_2_2_34_1","unstructured":"Zhihao\n      Jia Sina\n      Lin Charles R.\n      Qi and \n      Alex\n      Aiken\n    .\n  Exploring hidden dimensions in accelerating convolutional neural networks\n  . volume \n  80\n   of \n  Proceedings of Machine Learning Research pages \n  2274\n  --\n  2283 Stockholmsm\u00e4ssan Stockholm Sweden 10--15 Jul \n  2018\n  . \n  PMLR.  Zhihao Jia Sina Lin Charles R. Qi and Alex Aiken. Exploring hidden dimensions in accelerating convolutional neural networks. volume 80 of Proceedings of Machine Learning Research pages 2274--2283 Stockholmsm\u00e4ssan Stockholm Sweden 10--15 Jul 2018. PMLR."},{"key":"e_1_3_2_2_35_1","volume-title":"Demystifying parallel and distributed deep learning: An in-depth concurrency analysis. CoRR, abs\/1802.09941","author":"BenNun Tal","year":"2018","unstructured":"Tal BenNun and Torsten Hoefler . Demystifying parallel and distributed deep learning: An in-depth concurrency analysis. CoRR, abs\/1802.09941 , 2018 . Tal BenNun and Torsten Hoefler. Demystifying parallel and distributed deep learning: An in-depth concurrency analysis. CoRR, abs\/1802.09941, 2018."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00036"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356207"},{"key":"e_1_3_2_2_38_1","volume-title":"SysML","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia , Matei Zaharia , and Alex Aiken . Beyond data and model parallelism for deep neural networks . SysML , 2019 . Zhihao Jia, Matei Zaharia, and Alex Aiken. Beyond data and model parallelism for deep neural networks. SysML, 2019."},{"key":"e_1_3_2_2_39_1","first-page":"1223","volume-title":"Advances in Neural Information Processing Systems 25","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean , Greg Corrado , Rajat Monga , Kai Chen , Matthieu Devin , Mark Mao , Marc aurelio Ranzato , Andrew Senior , Paul Tucker , Ke Yang , Quoc V. Le , and Andrew Y. Ng . Large scale distributed deep networks. In F. Pereira, C. J. C. Burges, L. Bottou, and K. Q. Weinberger, editors , Advances in Neural Information Processing Systems 25 , pages 1223 -- 1231 . Curran Associates, Inc. , 2012 . Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Marc aurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, Quoc V. Le, and Andrew Y. Ng. Large scale distributed deep networks. In F. Pereira, C. J. C. Burges, L. Bottou, and K. Q. Weinberger, editors, Advances in Neural Information Processing Systems 25, pages 1223--1231. Curran Associates, Inc., 2012."},{"key":"e_1_3_2_2_40_1","volume-title":"Integrated model and data parallelism in training neural networks. CoRR, abs\/1712.04432","author":"Gholami Amir","year":"2017","unstructured":"Amir Gholami , Ariful Azad , Kurt Keutzer , and Aydin Bulu\u00e7 . Integrated model and data parallelism in training neural networks. CoRR, abs\/1712.04432 , 2017 . Amir Gholami, Ariful Azad, Kurt Keutzer, and Aydin Bulu\u00e7. Integrated model and data parallelism in training neural networks. CoRR, abs\/1712.04432, 2017."},{"key":"e_1_3_2_2_41_1","first-page":"3983","volume-title":"Advances in Neural Information Processing Systems 32","author":"Addanki Ravichandra","year":"2019","unstructured":"Ravichandra Addanki , Shaileshh Bojja Venkatakrishnan , Shreyan Gupta , Hongzi Mao , and Mohammad Alizadeh . Learning generalizable device placement algorithms for distributed machine learning . In Advances in Neural Information Processing Systems 32 , pages 3983 -- 3993 . Curran Associates, Inc. , 2019 . Ravichandra Addanki, Shaileshh Bojja Venkatakrishnan, Shreyan Gupta, Hongzi Mao, and Mohammad Alizadeh. Learning generalizable device placement algorithms for distributed machine learning. In Advances in Neural Information Processing Systems 32, pages 3983--3993. Curran Associates, Inc., 2019."},{"key":"e_1_3_2_2_42_1","volume-title":"Paving Path For Advanced Conversational AI","author":"Narasimhan Shar","year":"2019","unstructured":"Shar Narasimhan . NVIDIA Clocks World's Fastest BERT Training Time and Largest Transformer Based Model , Paving Path For Advanced Conversational AI , Aug. 2019 . https:\/\/devblogs.nvidia.com\/training-bert-with-gpus\/. Shar Narasimhan. NVIDIA Clocks World's Fastest BERT Training Time and Largest Transformer Based Model, Paving Path For Advanced Conversational AI, Aug. 2019. https:\/\/devblogs.nvidia.com\/training-bert-with-gpus\/."},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00031"},{"key":"e_1_3_2_2_44_1","first-page":"1729","volume-title":"Proceedings of the 31st International Conference on Neural Information Processing Systems, NIPS'17","author":"Hoffer Elad","year":"2017","unstructured":"Elad Hoffer , Itay Hubara , and Daniel Soudry . Train longer, generalize better: Closing the generalization gap in large batch training of neural networks . In Proceedings of the 31st International Conference on Neural Information Processing Systems, NIPS'17 , pages 1729 -- 1739 , Red Hook, NY, USA , 2017 . Curran Associates Inc. Elad Hoffer, Itay Hubara, and Daniel Soudry. Train longer, generalize better: Closing the generalization gap in large batch training of neural networks. In Proceedings of the 31st International Conference on Neural Information Processing Systems, NIPS'17, pages 1729--1739, Red Hook, NY, USA, 2017. Curran Associates Inc."},{"key":"e_1_3_2_2_45_1","volume-title":"large minibatch SGD: training imagenet in 1 hour. CoRR, abs\/1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal , Piotr Doll\u00e1r , Ross B. Girshick , Pieter Noordhuis , Lukasz Wesolowski , Aapo Kyrola , Andrew Tulloch , Yangqing Jia , and Kaiming He. Accurate , large minibatch SGD: training imagenet in 1 hour. CoRR, abs\/1706.02677 , 2017 . Priya Goyal, Piotr Doll\u00e1r, Ross B. Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. Accurate, large minibatch SGD: training imagenet in 1 hour. CoRR, abs\/1706.02677, 2017."},{"issue":"112","key":"e_1_3_2_2_46_1","first-page":"1","article-title":"Measuring the effects of data parallelism on neural network training","volume":"20","author":"Shallue Christopher J.","year":"2019","unstructured":"Christopher J. Shallue , Jaehoon Lee , Joseph Antognini , Jascha Sohl-Dickstein , Roy Frostig , and George E. Dahl . Measuring the effects of data parallelism on neural network training . Journal of Machine Learning Research , 20 ( 112 ): 1 -- 49 , 2019 . Christopher J. Shallue, Jaehoon Lee, Joseph Antognini, Jascha Sohl-Dickstein, Roy Frostig, and George E. Dahl. Measuring the effects of data parallelism on neural network training. Journal of Machine Learning Research, 20(112):1--49, 2019.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_2_47_1","article-title":"The case for strong scaling in deep learning: Training large 3d cnns with hybrid parallelism","author":"Oyama Yosuke","year":"2020","unstructured":"Yosuke Oyama , Naoya Maruyama , Nikoli Dryden , Erin McCarthy , Peter Harrington , Jan Balewski , Satoshi Matsuoka , Peter Nugent , and Brian Van Essen . The case for strong scaling in deep learning: Training large 3d cnns with hybrid parallelism . IEEE Transactions on Parallel and Distributed Systems , 2020 . Yosuke Oyama, Naoya Maruyama, Nikoli Dryden, Erin McCarthy, Peter Harrington, Jan Balewski, Satoshi Matsuoka, Peter Nugent, and Brian Van Essen. The case for strong scaling in deep learning: Training large 3d cnns with hybrid parallelism. IEEE Transactions on Parallel and Distributed Systems, 2020.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"e_1_3_2_2_48_1","volume-title":"Aug.","author":"Perf","year":"2019","unstructured":"ML Perf v0.6 : NVIDIA Implementation of Attention Mechanisms for Translation , Aug. 2019 . https:\/\/github.com\/mlperf\/training_results_v0.6\/tree\/master\/NVIDIA\/benchmarks\/transformer\/implementations\/pytorch. MLPerf v0.6: NVIDIA Implementation of Attention Mechanisms for Translation, Aug. 2019. https:\/\/github.com\/mlperf\/training_results_v0.6\/tree\/master\/NVIDIA\/benchmarks\/transformer\/implementations\/pytorch."},{"key":"e_1_3_2_2_49_1","unstructured":"ResNet v1.5 for TensorFlow 2020.  ResNet v1.5 for TensorFlow 2020."},{"key":"e_1_3_2_2_50_1","unstructured":"NVIDIA Data Center Deep Learning Product Performance. https:\/\/developer.nvidia.com\/deep-learning-performance-training-inference.  NVIDIA Data Center Deep Learning Product Performance. https:\/\/developer.nvidia.com\/deep-learning-performance-training-inference."},{"key":"e_1_3_2_2_51_1","unstructured":"Nvidia DGX-2. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/dgx-2\/dgx-2-print-datasheet-738070-nvidia-a4-web-uk.pdf.  Nvidia DGX-2. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/dgx-2\/dgx-2-print-datasheet-738070-nvidia-a4-web-uk.pdf."},{"key":"e_1_3_2_2_52_1","volume-title":"Jul.","year":"2019","unstructured":"MegatronLM : Training Billion+ Parameter Language Models Using GPU Model Parallelism , Jul. 2019 . https:\/\/nv-adlr.github.io\/MegatronLM. MegatronLM: Training Billion+ Parameter Language Models Using GPU Model Parallelism, Jul. 2019. https:\/\/nv-adlr.github.io\/MegatronLM."},{"key":"e_1_3_2_2_53_1","volume-title":"Zero: Memory optimizations toward training trillion parameter models","author":"Rajbhandari Samyam","year":"2019","unstructured":"Samyam Rajbhandari , Jeff Rasley , Olatunji Ruwase , and Yuxiong He . Zero: Memory optimizations toward training trillion parameter models , 2019 . https:\/\/www.deepspeed.ai\/. Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. Zero: Memory optimizations toward training trillion parameter models, 2019. https:\/\/www.deepspeed.ai\/."},{"key":"e_1_3_2_2_54_1","volume-title":"Efficient communication acceleration for next-gen scale-up deep learning training platforms","author":"Rashidi Saeed","year":"2020","unstructured":"Saeed Rashidi , Srinivas Sridharan , Sudarshan Srinivasan , Matthew Denton , and Tushar Krishna . Efficient communication acceleration for next-gen scale-up deep learning training platforms , 2020 . Saeed Rashidi, Srinivas Sridharan, Sudarshan Srinivasan, Matthew Denton, and Tushar Krishna. Efficient communication acceleration for next-gen scale-up deep learning training platforms, 2020."},{"key":"e_1_3_2_2_55_1","volume-title":"Hugh Williams. Sirius: A Flat Datacenter Network with Nanosecond Optical Switching. SIGCOMM'20","author":"Ballani Hitesh","year":"2020","unstructured":"Hitesh Ballani , Paolo Costa , Raphael Behrendt , Daniel Cletheroe , Istvan Haller , Krzysztof Jozwik , Fotini Karinou , Sophie Lange , Kai Shi , Benn Thomsen , and Hugh Williams. Sirius: A Flat Datacenter Network with Nanosecond Optical Switching. SIGCOMM'20 , Aug. 2020 . Hitesh Ballani, Paolo Costa, Raphael Behrendt, Daniel Cletheroe, Istvan Haller, Krzysztof Jozwik, Fotini Karinou, Sophie Lange, Kai Shi, Benn Thomsen, and Hugh Williams. Sirius: A Flat Datacenter Network with Nanosecond Optical Switching. SIGCOMM'20, Aug. 2020."},{"key":"e_1_3_2_2_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/2000064.2000108"},{"key":"e_1_3_2_2_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2013.7478302"},{"key":"e_1_3_2_2_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/JLT.2015.2391301"},{"key":"e_1_3_2_2_59_1","unstructured":"Alexis Bj\u00f6rlin and Manish Mehta. Broadcom discusses its co-packaged optics plans. http:\/\/www.gazettabyte.com\/home\/2021\/4\/27\/broadcom-discusses-its-co-packaged-optics-plans.html 2021. [Online; last accessed 25-June-2021].  Alexis Bj\u00f6rlin and Manish Mehta. Broadcom discusses its co-packaged optics plans. http:\/\/www.gazettabyte.com\/home\/2021\/4\/27\/broadcom-discusses-its-co-packaged-optics-plans.html 2021. [Online; last accessed 25-June-2021]."},{"key":"e_1_3_2_2_60_1","volume-title":"Mar.","author":"Leibson Steven","year":"2020","unstructured":"Steven Leibson . Ayar labs and Intel demo FPGA with optical transceivers in DARPA PIPES project: 2 Tbps now, &gt;100 Tbps is the goal , Mar. 2020 . https:\/\/blogs.intel.com\/psg\/ayar-labs-and-intel-demo-fpga-with-optical-transceivers-in-darpa-pipes-project-2-tbps-now-100-tbps-is-the-goal\/. Steven Leibson. Ayar labs and Intel demo FPGA with optical transceivers in DARPA PIPES project: 2 Tbps now, &gt;100 Tbps is the goal, Mar. 2020. https:\/\/blogs.intel.com\/psg\/ayar-labs-and-intel-demo-fpga-with-optical-transceivers-in-darpa-pipes-project-2-tbps-now-100-tbps-is-the-goal\/."},{"key":"e_1_3_2_2_61_1","unstructured":"Pipes researchers demonstrate optical interconnects to improve performance of digital microelectronics Mar. 2020. https:\/\/www.darpa.mil\/news-events\/2020-03-25.  Pipes researchers demonstrate optical interconnects to improve performance of digital microelectronics Mar. 2020. https:\/\/www.darpa.mil\/news-events\/2020-03-25."},{"key":"e_1_3_2_2_62_1","volume-title":"Aug.","author":"Trader Tiffany","year":"2019","unstructured":"Tiffany Trader . Ayar Labs to Demo Photonics Chiplet in FPGA Package at Hot Chips , Aug. 2019 . https:\/\/www.hpcwire.com\/2019\/08\/19\/ayar-labs-to-demo-photonics-chiplet-in-fpga-package-at-hot-chips\/. Tiffany Trader. Ayar Labs to Demo Photonics Chiplet in FPGA Package at Hot Chips, Aug. 2019. https:\/\/www.hpcwire.com\/2019\/08\/19\/ayar-labs-to-demo-photonics-chiplet-in-fpga-package-at-hot-chips\/."},{"key":"e_1_3_2_2_63_1","volume-title":"Fleet---fast lanes for expedited execution at 10 terabits: Program overview","author":"Douglis F.","unstructured":"F. Douglis , S. Robertson , E. Van den Berg , J. Micallef , M. Pucci , A. Aiken , M. Hattink , M. Seok , and K. Bergman . Fleet---fast lanes for expedited execution at 10 terabits: Program overview . IEEE Internet Computing , (01):1--1, apr 5555. F. Douglis, S. Robertson, E. Van den Berg, J. Micallef, M. Pucci, A. Aiken, M. Hattink, M. Seok, and K. Bergman. Fleet---fast lanes for expedited execution at 10 terabits: Program overview. IEEE Internet Computing, (01):1--1, apr 5555."},{"key":"e_1_3_2_2_64_1","unstructured":"Ayar Labs TeraPHY Silicon Chip. https:\/\/ayarlabs.com\/products\/.  Ayar Labs TeraPHY Silicon Chip. https:\/\/ayarlabs.com\/products\/."},{"key":"e_1_3_2_2_65_1","unstructured":"Demonstration of Ayar Labs' Optical I\/O Multi-Chip Package and Single-Die Package solutions Aug. 2020. https:\/\/vimeo.com\/449164007.  Demonstration of Ayar Labs' Optical I\/O Multi-Chip Package and Single-Die Package solutions Aug. 2020. https:\/\/vimeo.com\/449164007."},{"key":"e_1_3_2_2_66_1","doi-asserted-by":"publisher","DOI":"10.1145\/3098822.3098838"},{"key":"e_1_3_2_2_67_1","doi-asserted-by":"publisher","DOI":"10.1364\/OPTICA.3.000064"},{"key":"e_1_3_2_2_68_1","doi-asserted-by":"publisher","DOI":"10.1364\/CLEO_SI.2018.SF1A.4"},{"key":"e_1_3_2_2_69_1","volume-title":"NSDI'20","author":"Mellette William M.","year":"2020","unstructured":"William M. Mellette , Rajdeep Das , Yibo Guo , Rob McGuinness , Alex C. Snoeren , and George Porter . Expanding across time to deliver bandwidth efficiency and low latency . NSDI'20 , 2020 . William M. Mellette, Rajdeep Das, Yibo Guo, Rob McGuinness, Alex C. Snoeren, and George Porter. Expanding across time to deliver bandwidth efficiency and low latency. NSDI'20, 2020."},{"key":"e_1_3_2_2_70_1","first-page":"283","volume-title":"SIGCOMM'14","author":"Liu Yunpeng James","unstructured":"Yunpeng James Liu , Peter Xiang Gao , Bernard Wong , and Srinivasan Keshav. Quartz : A new design element for low-latency dcns . SIGCOMM'14 , pages 283 -- 294 . Yunpeng James Liu, Peter Xiang Gao, Bernard Wong, and Srinivasan Keshav. Quartz: A new design element for low-latency dcns. SIGCOMM'14, pages 283--294."},{"key":"e_1_3_2_2_71_1","first-page":"447","volume-title":"SIGCOMM'13","author":"Porter George","unstructured":"George Porter , Richard Strong , Nathan Farrington , Alex Forencich , Pang Chen-Sun , Tajana Rosing , Yeshaiahu Fainman , George Papen , and Amin Vahdat . Integrating microsecond circuit switching into the data center . SIGCOMM'13 , pages 447 -- 458 . George Porter, Richard Strong, Nathan Farrington, Alex Forencich, Pang Chen-Sun, Tajana Rosing, Yeshaiahu Fainman, George Papen, and Amin Vahdat. Integrating microsecond circuit switching into the data center. SIGCOMM'13, pages 447--458."},{"key":"e_1_3_2_2_72_1","volume-title":"HotNets-XIII Proceedings of the 13th ACM Workshop on Hot Topics in Networks. ACM","author":"Padhye Jitu","year":"2014","unstructured":"meg walraed sullivan, Jitu Padhye , and Dave Maltz . Theia : Simple and cheap networking for ultra-dense data centers . In HotNets-XIII Proceedings of the 13th ACM Workshop on Hot Topics in Networks. ACM , October 2014 . meg walraed sullivan, Jitu Padhye, and Dave Maltz. Theia: Simple and cheap networking for ultra-dense data centers. In HotNets-XIII Proceedings of the 13th ACM Workshop on Hot Topics in Networks. ACM, October 2014."},{"key":"e_1_3_2_2_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/2462902.2462917"},{"key":"e_1_3_2_2_74_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851191"},{"key":"e_1_3_2_2_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.113081"},{"key":"e_1_3_2_2_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/1394608.1382129"},{"key":"e_1_3_2_2_77_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-67630-2_5"},{"key":"e_1_3_2_2_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2008.19"},{"key":"e_1_3_2_2_79_1","unstructured":"Calient Optical Circuit Switch. https:\/\/www.calient.net\/products\/edge640-optical-circuit-switch\/.  Calient Optical Circuit Switch. https:\/\/www.calient.net\/products\/edge640-optical-circuit-switch\/."},{"key":"e_1_3_2_2_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/1402946.1402967"},{"key":"e_1_3_2_2_81_1","first-page":"74","volume-title":"Proceedings of the ACM SIGCOMM 2009 Conference on Data Communication, SIGCOMM '09","author":"Guo Chuanxiong","year":"2009","unstructured":"Chuanxiong Guo , Guohan Lu , Dan Li , Haitao Wu , Xuan Zhang , Yunfeng Shi , Chen Tian , Yongguang Zhang , and Songwu Lu. Bcube : A high performance, server-centric network architecture for modular data centers . In Proceedings of the ACM SIGCOMM 2009 Conference on Data Communication, SIGCOMM '09 , page 63? 74 , New York, NY, USA , 2009 . Association for Computing Machinery. Chuanxiong Guo, Guohan Lu, Dan Li, Haitao Wu, Xuan Zhang, Yunfeng Shi, Chen Tian, Yongguang Zhang, and Songwu Lu. Bcube: A high performance, server-centric network architecture for modular data centers. In Proceedings of the ACM SIGCOMM 2009 Conference on Data Communication, SIGCOMM '09, page 63?74, New York, NY, USA, 2009. Association for Computing Machinery."},{"key":"e_1_3_2_2_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.34"},{"key":"e_1_3_2_2_83_1","volume-title":"Mark Hummel, and John Schafer. NVIDIA's NVLink-Switching Chip and Scale-Up GPU-Compute Server. HotChips","author":"Ishii Alexander","year":"2018","unstructured":"Alexander Ishii , Denis Foley , Eric Anderson , Bill Dally , Glenn Dearth Larry Dennison , Mark Hummel, and John Schafer. NVIDIA's NVLink-Switching Chip and Scale-Up GPU-Compute Server. HotChips , 2018 . https:\/\/www.hotchips.org\/hc30\/2conf\/2.01_Nvidia_NVswitch_HotChips2018_DGX2NVS_Final.pdf. Alexander Ishii, Denis Foley, Eric Anderson, Bill Dally, Glenn Dearth Larry Dennison, Mark Hummel, and John Schafer. NVIDIA's NVLink-Switching Chip and Scale-Up GPU-Compute Server. HotChips, 2018. https:\/\/www.hotchips.org\/hc30\/2conf\/2.01_Nvidia_NVswitch_HotChips2018_DGX2NVS_Final.pdf."},{"key":"e_1_3_2_2_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_2_85_1","unstructured":"Alec Radford Karthik Narasimhan Tim Salimans and Ilya Sutskever. Improving Language Understanding by Generative Pre-Training.  Alec Radford Karthik Narasimhan Tim Salimans and Ilya Sutskever. Improving Language Understanding by Generative Pre-Training."},{"key":"e_1_3_2_2_86_1","volume-title":"Measuring the effects of data parallelism on neural network training. CoRR, abs\/1811.03600","author":"Shallue Christopher J.","year":"2018","unstructured":"Christopher J. Shallue , Jaehoon Lee , Joseph M. Antognini , Jascha Sohl-Dickstein , Roy Frostig , and George E. Dahl . Measuring the effects of data parallelism on neural network training. CoRR, abs\/1811.03600 , 2018 . Christopher J. Shallue, Jaehoon Lee, Joseph M. Antognini, Jascha Sohl-Dickstein, Roy Frostig, and George E. Dahl. Measuring the effects of data parallelism on neural network training. CoRR, abs\/1811.03600, 2018."},{"key":"e_1_3_2_2_87_1","volume-title":"Aug.","author":"Puri Raul","year":"2019","unstructured":"Raul Puri . Megatron: a large, powerful transformer , Aug. 2019 . https:\/\/github.com\/NVIDIA\/Megatron-LM. Raul Puri. Megatron: a large, powerful transformer, Aug. 2019. https:\/\/github.com\/NVIDIA\/Megatron-LM."},{"key":"e_1_3_2_2_88_1","unstructured":"MLPerf: A broad ML benchmark suite. https:\/\/mlperf.org\/.  MLPerf: A broad ML benchmark suite. https:\/\/mlperf.org\/."},{"key":"e_1_3_2_2_89_1","unstructured":"FlexFlow Github. https:\/\/github.com\/flexflow\/FlexFlow.git.  FlexFlow Github. https:\/\/github.com\/flexflow\/FlexFlow.git."},{"key":"e_1_3_2_2_90_1","volume-title":"Extremely large minibatch sgd: Training resnet-50 on imagenet in 15 minutes. arXiv preprint arXiv:1711.04325","author":"Akiba Takuya","year":"2017","unstructured":"Takuya Akiba , Shuji Suzuki , and Keisuke Fukuda . Extremely large minibatch sgd: Training resnet-50 on imagenet in 15 minutes. arXiv preprint arXiv:1711.04325 , 2017 . Takuya Akiba, Shuji Suzuki, and Keisuke Fukuda. Extremely large minibatch sgd: Training resnet-50 on imagenet in 15 minutes. arXiv preprint arXiv:1711.04325, 2017."},{"key":"e_1_3_2_2_91_1","doi-asserted-by":"publisher","DOI":"10.1145\/3225058.3225069"},{"key":"e_1_3_2_2_92_1","volume-title":"Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. arXiv preprint arXiv:1807.11205","author":"Jia Xianyan","year":"2018","unstructured":"Xianyan Jia , Shutao Song , Wei He , Yangzihao Wang , Haidong Rong , Feihu Zhou , Liqiang Xie , Zhenyu Guo , Yuanzhou Yang , Liwei Yu , Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. arXiv preprint arXiv:1807.11205 , 2018 . Xianyan Jia, Shutao Song, Wei He, Yangzihao Wang, Haidong Rong, Feihu Zhou, Liqiang Xie, Zhenyu Guo, Yuanzhou Yang, Liwei Yu, et al. Highly scalable deep learning training system with mixed-precision: Training imagenet in four minutes. arXiv preprint arXiv:1807.11205, 2018."},{"key":"e_1_3_2_2_93_1","doi-asserted-by":"publisher","DOI":"10.1145\/1851182.1851192"},{"key":"e_1_3_2_2_94_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341302.3342085"},{"key":"e_1_3_2_2_95_1","doi-asserted-by":"publisher","DOI":"10.1364\/OFC.2019.M4D.7"},{"key":"e_1_3_2_2_96_1","first-page":"05","article-title":"Silicon photonics wdm transceiver with soa and semiconductor mode-locked laser","volume":"7","author":"Alvaro","year":"2016","unstructured":"Alvaro moscoso martir, Juliana M\u00fcller , Johannes Hauck , Nicolas Chimot , Rony Setter , Avner Badihi , Daniel Rasmussen , Alexandre Garreau , Mads Nielsen , Elmira Islamova , Sebastian Romero-Garc\u00eda , Bin Shen , Anna Sandomirsky , Sylvie Rockman , Chao Li , Saeed Sharif Azadeh , Guo-Qiang Lo , Elad Mentovich , Florian Merget , and Jeremy Witzens . Silicon photonics wdm transceiver with soa and semiconductor mode-locked laser . Scientific Reports , 7 , 05 2016 . Alvaro moscoso martir, Juliana M\u00fcller, Johannes Hauck, Nicolas Chimot, Rony Setter, Avner Badihi, Daniel Rasmussen, Alexandre Garreau, Mads Nielsen, Elmira Islamova, Sebastian Romero-Garc\u00eda, Bin Shen, Anna Sandomirsky, Sylvie Rockman, Chao Li, Saeed Sharif Azadeh, Guo-Qiang Lo, Elad Mentovich, Florian Merget, and Jeremy Witzens. Silicon photonics wdm transceiver with soa and semiconductor mode-locked laser. Scientific Reports, 7, 05 2016.","journal-title":"Scientific Reports"},{"key":"e_1_3_2_2_97_1","unstructured":"2020 General Europractice Pricelist Jan. 2020. https:\/\/europractice-ic.com\/wp-content\/uploads\/2020\/01\/General-MPW-EUROPRACTICE-200123-v3.pdf.  2020 General Europractice Pricelist Jan. 2020. https:\/\/europractice-ic.com\/wp-content\/uploads\/2020\/01\/General-MPW-EUROPRACTICE-200123-v3.pdf."},{"key":"e_1_3_2_2_98_1","doi-asserted-by":"publisher","DOI":"10.1109\/EPTC.2017.8277464"},{"key":"e_1_3_2_2_99_1","doi-asserted-by":"publisher","DOI":"10.1038\/nature16454"},{"key":"e_1_3_2_2_100_1","first-page":"1223","volume-title":"Advances in neural information processing systems","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean , Greg Corrado , Rajat Monga , Kai Chen , Matthieu Devin , Mark Mao , Andrew Senior , Paul Tucker , Ke Yang , Quoc V Le , Large scale distributed deep networks . In Advances in neural information processing systems , pages 1223 -- 1231 , 2012 . Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Andrew Senior, Paul Tucker, Ke Yang, Quoc V Le, et al. Large scale distributed deep networks. In Advances in neural information processing systems, pages 1223--1231, 2012."},{"key":"e_1_3_2_2_101_1","first-page":"1337","volume-title":"International conference on machine learning","author":"Coates Adam","year":"2013","unstructured":"Adam Coates , Brody Huval , Tao Wang , David Wu , Bryan Catanzaro , and Ng Andrew . Deep learning with cots hpc systems . In International conference on machine learning , pages 1337 -- 1345 , 2013 . Adam Coates, Brody Huval, Tao Wang, David Wu, Bryan Catanzaro, and Ng Andrew. Deep learning with cots hpc systems. In International conference on machine learning, pages 1337--1345, 2013."},{"key":"e_1_3_2_2_102_1","first-page":"571","volume-title":"OSDI'14","author":"Chilimbi Trishul","year":"2014","unstructured":"Trishul Chilimbi , Yutaka Suzue , Johnson Apacible , and Karthik Kalyanaraman . Project adam : Building an efficient and scalable deep learning training system . In OSDI'14 , pages 571 -- 582 , 2014 . Trishul Chilimbi, Yutaka Suzue, Johnson Apacible, and Karthik Kalyanaraman. Project adam: Building an efficient and scalable deep learning training system. In OSDI'14, pages 571--582, 2014."},{"key":"e_1_3_2_2_103_1","first-page":"485","volume-title":"NSDI'19","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu , Mosharaf Chowdhury , Kang G Shin , Yibo Zhu , Myeongjae Jeon , Junjie Qian , Hongqiang Liu , and Chuanxiong Guo . Tiresias : A {GPU} cluster manager for distributed deep learning . In NSDI'19 , pages 485 -- 500 , 2019 . Juncheng Gu, Mosharaf Chowdhury, Kang G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. Tiresias: A {GPU} cluster manager for distributed deep learning. In NSDI'19, pages 485--500, 2019."},{"key":"e_1_3_2_2_104_1","volume-title":"Optimizing network performance for distributed dnn training on gpu clusters: Imagenet\/alexnet training in 1.5 minutes. arXiv preprint arXiv:1902.06855","author":"Sun Peng","year":"2019","unstructured":"Peng Sun , Wansen Feng , Ruobing Han , Shengen Yan , and Yonggang Wen . Optimizing network performance for distributed dnn training on gpu clusters: Imagenet\/alexnet training in 1.5 minutes. arXiv preprint arXiv:1902.06855 , 2019 . Peng Sun, Wansen Feng, Ruobing Han, Shengen Yan, and Yonggang Wen. Optimizing network performance for distributed dnn training on gpu clusters: Imagenet\/alexnet training in 1.5 minutes. arXiv preprint arXiv:1902.06855, 2019."},{"key":"e_1_3_2_2_105_1","volume-title":"Pytorch-biggraph: A large-scale graph embedding system. CoRR, abs\/1903.12287","author":"Lerer Adam","year":"2019","unstructured":"Adam Lerer , Ledell Wu , Jiajun Shen , Timoth\u00e9e Lacroix , Luca Wehrstedt , Abhijit Bose , and Alexander Peysakhovich . Pytorch-biggraph: A large-scale graph embedding system. CoRR, abs\/1903.12287 , 2019 . Adam Lerer, Ledell Wu, Jiajun Shen, Timoth\u00e9e Lacroix, Luca Wehrstedt, Abhijit Bose, and Alexander Peysakhovich. Pytorch-biggraph: A large-scale graph embedding system. CoRR, abs\/1903.12287, 2019."},{"key":"e_1_3_2_2_106_1","volume-title":"7th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 15)","author":"Mai Luo","year":"2015","unstructured":"Luo Mai , Chuntao Hong , and Paolo Costa . Optimizing network performance in distributed machine learning . In 7th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 15) , Santa Clara, CA , 2015 . USENIX Association. Luo Mai, Chuntao Hong, and Paolo Costa. Optimizing network performance in distributed machine learning. In 7th USENIX Workshop on Hot Topics in Cloud Computing (HotCloud 15), Santa Clara, CA, 2015. USENIX Association."},{"key":"e_1_3_2_2_107_1","volume-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887","author":"Lin Yujun","year":"2017","unstructured":"Yujun Lin , Song Han , Huizi Mao , Yu Wang , and William J Dally . Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887 , 2017 . Yujun Lin, Song Han, Huizi Mao, Yu Wang, and William J Dally. Deep gradient compression: Reducing the communication bandwidth for distributed training. arXiv preprint arXiv:1712.01887, 2017."},{"key":"e_1_3_2_2_108_1","first-page":"1710","volume-title":"QSGD: Communication-efficient SGD via randomized quantization and encoding","author":"Alistarh Dan","year":"2018","unstructured":"Dan Alistarh , Demjan Grubic , Jerry Li , Ryota Tomioka , and Milan Vojnovic . QSGD: Communication-efficient SGD via randomized quantization and encoding . volume 3 , pages 1710 -- 1721 , 2018 . Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. QSGD: Communication-efficient SGD via randomized quantization and encoding. volume 3, pages 1710 -- 1721, 2018."},{"key":"e_1_3_2_2_109_1","volume-title":"3lc: Lightweight and effective traffic compression for distributed machine learning. arXiv preprint arXiv:1802.07389","author":"Lim Hyeontaek","year":"2018","unstructured":"Hyeontaek Lim , David G Andersen , and Michael Kaminsky . 3lc: Lightweight and effective traffic compression for distributed machine learning. arXiv preprint arXiv:1802.07389 , 2018 . Hyeontaek Lim, David G Andersen, and Michael Kaminsky. 3lc: Lightweight and effective traffic compression for distributed machine learning. arXiv preprint arXiv:1802.07389, 2018."},{"key":"e_1_3_2_2_110_1","first-page":"2525","volume-title":"Advances in Neural Information Processing Systems 31","author":"Jiang Peng","year":"2018","unstructured":"Peng Jiang and Gagan Agrawal . A linear speedup analysis of distributed deep learning with sparse and quantized communication. In S. Bengio, H. Wallach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett, editors , Advances in Neural Information Processing Systems 31 , pages 2525 -- 2536 . Curran Associates, Inc. , 2018 . Peng Jiang and Gagan Agrawal. A linear speedup analysis of distributed deep learning with sparse and quantized communication. In S. Bengio, H. Wallach, H. Larochelle, K. Grauman, N. Cesa-Bianchi, and R. Garnett, editors, Advances in Neural Information Processing Systems 31, pages 2525--2536. Curran Associates, Inc., 2018."},{"key":"e_1_3_2_2_111_1","volume-title":"One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997","author":"Krizhevsky Alex","year":"2014","unstructured":"Alex Krizhevsky . One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997 , 2014 . Alex Krizhevsky. One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997, 2014."},{"key":"e_1_3_2_2_112_1","first-page":"685","volume-title":"Advances in Neural Information Processing Systems","author":"Zhang Sixin","year":"2015","unstructured":"Sixin Zhang , Anna E Choromanska , and Yann LeCun . Deep learning with elastic averaging sgd . In Advances in Neural Information Processing Systems , pages 685 -- 693 , 2015 . Sixin Zhang, Anna E Choromanska, and Yann LeCun. Deep learning with elastic averaging sgd. In Advances in Neural Information Processing Systems, pages 685--693, 2015."},{"key":"e_1_3_2_2_113_1","first-page":"873","volume-title":"Advances in Neural Information Processing Systems","author":"Agarwal Alekh","year":"2011","unstructured":"Alekh Agarwal and John C Duchi . Distributed delayed stochastic optimization . In Advances in Neural Information Processing Systems , pages 873 -- 881 , 2011 . Alekh Agarwal and John C Duchi. Distributed delayed stochastic optimization. In Advances in Neural Information Processing Systems, pages 873--881, 2011."},{"key":"e_1_3_2_2_114_1","first-page":"693","volume-title":"Proceedings of the 24th International Conference on Neural Information Processing Systems, NIPS'11","author":"Niu Feng","year":"2011","unstructured":"Feng Niu , Benjamin Recht , Christopher Re , and Stephen J. Wright . Hogwild!: A lock-free approach to parallelizing stochastic gradient descent . In Proceedings of the 24th International Conference on Neural Information Processing Systems, NIPS'11 , pages 693 -- 701 , 2011 . Feng Niu, Benjamin Recht, Christopher Re, and Stephen J. Wright. Hogwild!: A lock-free approach to parallelizing stochastic gradient descent. In Proceedings of the 24th International Conference on Neural Information Processing Systems, NIPS'11, pages 693--701, 2011."},{"key":"e_1_3_2_2_115_1","doi-asserted-by":"publisher","DOI":"10.1145\/2987550.2987586"},{"key":"e_1_3_2_2_116_1","volume-title":"Sangeetha Abdu Jyothi, and Roy H. Campbell. Communication scheduling as a first-class citizen in distributed machine learning systems. CoRR, abs\/1803.03288","author":"Hashemi Sayed Hadi","year":"2018","unstructured":"Sayed Hadi Hashemi , Sangeetha Abdu Jyothi, and Roy H. Campbell. Communication scheduling as a first-class citizen in distributed machine learning systems. CoRR, abs\/1803.03288 , 2018 . Sayed Hadi Hashemi, Sangeetha Abdu Jyothi, and Roy H. Campbell. Communication scheduling as a first-class citizen in distributed machine learning systems. CoRR, abs\/1803.03288, 2018."},{"key":"e_1_3_2_2_117_1","volume-title":"Firecaffe: near-linear acceleration of deep neural network training on compute clusters. CoRR, abs\/1511.00175","author":"Iandola Forrest N.","year":"2015","unstructured":"Forrest N. Iandola , Khalid Ashraf , Matthew W. Moskewicz , and Kurt Keutzer . Firecaffe: near-linear acceleration of deep neural network training on compute clusters. CoRR, abs\/1511.00175 , 2015 . Forrest N. Iandola, Khalid Ashraf, Matthew W. Moskewicz, and Kurt Keutzer. Firecaffe: near-linear acceleration of deep neural network training on compute clusters. CoRR, abs\/1511.00175, 2015."},{"key":"e_1_3_2_2_118_1","volume-title":"Hot Chips","volume":"29","author":"Chung Eric","year":"2017","unstructured":"Eric Chung , Jeremy Fowers , Kalin Ovtcharov , Michael Papamichael , Adrian Caulfield , Todd Massengil , Ming Liu , Daniel Lo , Shlomi Alkalay , and Michael Haselman . Accelerating persistent neural networks at datacenter scale . In Hot Chips , volume 29 , 2017 . Eric Chung, Jeremy Fowers, Kalin Ovtcharov, Michael Papamichael, Adrian Caulfield, Todd Massengil, Ming Liu, Daniel Lo, Shlomi Alkalay, and Michael Haselman. Accelerating persistent neural networks at datacenter scale. In Hot Chips, volume 29, 2017."},{"key":"e_1_3_2_2_119_1","volume-title":"Priority-based parameter propagation for distributed DNN training. CoRR, abs\/1905.03960","author":"Jayarajan Anand","year":"2019","unstructured":"Anand Jayarajan , Jinliang Wei , Garth Gibson , Alexandra Fedorova , and Gennady Pekhimenko . Priority-based parameter propagation for distributed DNN training. CoRR, abs\/1905.03960 , 2019 . Anand Jayarajan, Jinliang Wei, Garth Gibson, Alexandra Fedorova, and Gennady Pekhimenko. Priority-based parameter propagation for distributed DNN training. CoRR, abs\/1905.03960, 2019."},{"key":"e_1_3_2_2_120_1","volume-title":"Horovod: fast and easy distributed deep learning in tensorflow. CoRR, abs\/1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso . Horovod: fast and easy distributed deep learning in tensorflow. CoRR, abs\/1802.05799 , 2018 . Alexander Sergeev and Mike Del Balso. Horovod: fast and easy distributed deep learning in tensorflow. CoRR, abs\/1802.05799, 2018."},{"key":"e_1_3_2_2_121_1","doi-asserted-by":"publisher","DOI":"10.5555\/3305890.3305932"},{"key":"e_1_3_2_2_122_1","first-page":"41","volume-title":"Zenglin Xu, and Tim Kraska. Superneurons: dynamic gpu memory management for training deep neural networks. In ACM SIGPLAN Notices","author":"Wang Linnan","year":"2018","unstructured":"Linnan Wang , Jinmian Ye , Yiyang Zhao , Wei Wu , Ang Li , Shuaiwen Leon Song , Zenglin Xu, and Tim Kraska. Superneurons: dynamic gpu memory management for training deep neural networks. In ACM SIGPLAN Notices , volume 53 , pages 41 -- 53 . ACM , 2018 . Linnan Wang, Jinmian Ye, Yiyang Zhao, Wei Wu, Ang Li, Shuaiwen Leon Song, Zenglin Xu, and Tim Kraska. Superneurons: dynamic gpu memory management for training deep neural networks. In ACM SIGPLAN Notices, volume 53, pages 41--53. ACM, 2018."},{"key":"e_1_3_2_2_123_1","volume-title":"NeurIPS","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang , Yonglong Cheng , Dehao Chen , HyoukJoong Lee , Jiquan Ngiam , Quoc V. Le , and Zhifeng Chen . Gpipe : Efficient training of giant neural networks using pipeline parallelism . NeurIPS , 2019 . Yanping Huang, Yonglong Cheng, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V. Le, and Zhifeng Chen. Gpipe: Efficient training of giant neural networks using pipeline parallelism. NeurIPS, 2019."},{"key":"e_1_3_2_2_124_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"e_1_3_2_2_125_1","doi-asserted-by":"publisher","DOI":"10.1038\/nphoton.2017.93"},{"key":"e_1_3_2_2_126_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2016.7446049"},{"key":"e_1_3_2_2_127_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2016.2587683"},{"key":"e_1_3_2_2_128_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_2_129_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2011.89"},{"key":"e_1_3_2_2_130_1","first-page":"19","volume-title":"Proceedings of the 7th USENIX Conference on Networked Systems Design and Implementation, NSDI'10","author":"Al-Fares Mohammad","year":"2010","unstructured":"Mohammad Al-Fares , Sivasankar Radhakrishnan , Barath Raghavan , Nelson Huang , and Amin Vahdat . Hedera : Dynamic flow scheduling for data center networks . In Proceedings of the 7th USENIX Conference on Networked Systems Design and Implementation, NSDI'10 , pages 19 -- 19 , Berkeley, CA, USA , 2010 . USENIX Association. Mohammad Al-Fares, Sivasankar Radhakrishnan, Barath Raghavan, Nelson Huang, and Amin Vahdat. Hedera: Dynamic flow scheduling for data center networks. In Proceedings of the 7th USENIX Conference on Networked Systems Design and Implementation, NSDI'10, pages 19--19, Berkeley, CA, USA, 2010. USENIX Association."},{"key":"e_1_3_2_2_131_1","first-page":"319","volume-title":"SIGCOMM'14","author":"Hamedazimi Navid","unstructured":"Navid Hamedazimi , Zafar Qazi , Himanshu Gupta , Vyas Sekar , Samir R. Das , Jon P. Longtin , Himanshu Shah , and Ashish Tanwer . Firefly : A reconfigurable wireless data center fabric using free-space optics . SIGCOMM'14 , pages 319 -- 330 . Navid Hamedazimi, Zafar Qazi, Himanshu Gupta, Vyas Sekar, Samir R. Das, Jon P. Longtin, Himanshu Shah, and Ashish Tanwer. Firefly: A reconfigurable wireless data center fabric using free-space optics. SIGCOMM'14, pages 319--330."},{"key":"e_1_3_2_2_132_1","doi-asserted-by":"publisher","DOI":"10.1145\/2934872.2934911"},{"key":"e_1_3_2_2_133_1","doi-asserted-by":"publisher","DOI":"10.1145\/2716281.2836126"},{"key":"e_1_3_2_2_134_1","first-page":"239","volume-title":"Presented as part of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12)","author":"Singla Ankit","year":"2012","unstructured":"Ankit Singla , Atul Singh , and Yan Chen . OSA: An optical switching architecture for data center networks with unprecedented flexibility . In Presented as part of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12) , pages 239 -- 252 , San Jose, CA , 2012 . USENIX. Ankit Singla, Atul Singh, and Yan Chen. OSA: An optical switching architecture for data center networks with unprecedented flexibility. In Presented as part of the 9th USENIX Symposium on Networked Systems Design and Implementation (NSDI 12), pages 239--252, San Jose, CA, 2012. USENIX."},{"key":"e_1_3_2_2_135_1","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI'19)","author":"Shrivastav Vishal","year":"2019","unstructured":"Vishal Shrivastav , Asaf Valadarsky , Hitesh Ballani , Paolo Costa , Ki Suh Lee , Han Wang , Rachit Agarwal , and Hakim Weatherspoon . Shoal : A network architecture for disaggregated racks . In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI'19) . USENIX, February 2019 . Vishal Shrivastav, Asaf Valadarsky, Hitesh Ballani, Paolo Costa, Ki Suh Lee, Han Wang, Rachit Agarwal, and Hakim Weatherspoon. Shoal: A network architecture for disaggregated racks. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI'19). USENIX, February 2019."},{"key":"e_1_3_2_2_136_1","first-page":"1","volume-title":"NSDI'14","author":"Liu He","unstructured":"He Liu , Feng Lu , Alex Forencich , Rishi Kapoor , Malveeka Tewari , Geoffrey M. Voelker , George Papen , Alex C. Snoeren , and George Porter . Circuit switching under the radar with REACToR . NSDI'14 , pages 1 -- 15 . He Liu, Feng Lu, Alex Forencich, Rishi Kapoor, Malveeka Tewari, Geoffrey M. Voelker, George Papen, Alex C. Snoeren, and George Porter. Circuit switching under the radar with REACToR. NSDI'14, pages 1--15."},{"key":"e_1_3_2_2_137_1","first-page":"17","volume-title":"Proceedings of the 9th USENIX Conference on Networked Systems Design and Implementation, NSDI'12","author":"Singla Ankit","year":"2012","unstructured":"Ankit Singla , Chi-Yao Hong , Lucian Popa , and P. Brighten Godfrey . Jellyfish: Networking data centers randomly . In Proceedings of the 9th USENIX Conference on Networked Systems Design and Implementation, NSDI'12 , pages 17 -- 17 , Berkeley, CA, USA , 2012 . USENIX Association. Ankit Singla, Chi-Yao Hong, Lucian Popa, and P. Brighten Godfrey. Jellyfish: Networking data centers randomly. In Proceedings of the 9th USENIX Conference on Networked Systems Design and Implementation, NSDI'12, pages 17--17, Berkeley, CA, USA, 2012. USENIX Association."},{"key":"e_1_3_2_2_138_1","first-page":"141","volume-title":"15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18)","author":"Chatzieleftheriou Andromachi","year":"2018","unstructured":"Andromachi Chatzieleftheriou , Sergey Legtchenko , Hugh Williams , and Antony Rowstron . Larry : Practical network reconfigurability in the data center . In 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18) , pages 141 -- 156 , Renton, WA , April 2018 . USENIX Association. Andromachi Chatzieleftheriou, Sergey Legtchenko, Hugh Williams, and Antony Rowstron. Larry: Practical network reconfigurability in the data center. In 15th USENIX Symposium on Networked Systems Design and Implementation (NSDI 18), pages 141--156, Renton, WA, April 2018. USENIX Association."},{"key":"e_1_3_2_2_139_1","first-page":"15","volume-title":"13th USENIX Symposium on Networked Systems Design and Implementation (NSDI 16)","author":"Legtchenko Sergey","year":"2016","unstructured":"Sergey Legtchenko , Nicholas Chen , Daniel Cletheroe , Antony Rowstron , Hugh Williams , and Xiaohan Zhao . Xfabric : A reconfigurable in-rack network for rack-scale computers . In 13th USENIX Symposium on Networked Systems Design and Implementation (NSDI 16) , pages 15 -- 29 , Santa Clara, CA , March 2016 . USENIX Association. Sergey Legtchenko, Nicholas Chen, Daniel Cletheroe, Antony Rowstron, Hugh Williams, and Xiaohan Zhao. Xfabric: A reconfigurable in-rack network for rack-scale computers. In 13th USENIX Symposium on Networked Systems Design and Implementation (NSDI 16), pages 15--29, Santa Clara, CA, March 2016. USENIX Association."},{"issue":"65","key":"e_1_3_2_2_140_1","first-page":"80","article-title":"Optical interconnects for extreme scale computing systems","volume":"64","author":"Rumley Sebastien","year":"2017","unstructured":"Sebastien Rumley , Meisam Bahadori , Robert Polster , Simon D. Hammond , David M. Calhoun , Ke Wen , Arun Rodrigues , and Keren Bergman . Optical interconnects for extreme scale computing systems . Parallel Computing , 64 : 65 -- 80 , 2017 . High-End Computing for Next-Generation Scientific Discovery. Sebastien Rumley, Meisam Bahadori, Robert Polster, Simon D. Hammond, David M. Calhoun, Ke Wen, Arun Rodrigues, and Keren Bergman. Optical interconnects for extreme scale computing systems. Parallel Computing, 64:65 -- 80, 2017. High-End Computing for Next-Generation Scientific Discovery.","journal-title":"Parallel Computing"},{"key":"e_1_3_2_2_141_1","doi-asserted-by":"publisher","DOI":"10.1364\/OE.16.015915"},{"key":"e_1_3_2_2_142_1","doi-asserted-by":"publisher","DOI":"10.1364\/OE.26.016022"},{"key":"e_1_3_2_2_143_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356145"},{"key":"e_1_3_2_2_144_1","doi-asserted-by":"publisher","DOI":"10.1109\/OIC.2018.8422036"},{"key":"e_1_3_2_2_145_1","doi-asserted-by":"publisher","DOI":"10.21236\/ADA594171"},{"key":"e_1_3_2_2_146_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF02614369"},{"key":"e_1_3_2_2_147_1","doi-asserted-by":"publisher","DOI":"10.5555\/3112670.3113031"}],"event":{"name":"SIGCOMM '21: ACM SIGCOMM 2021 Conference","location":"Virtual Event USA","acronym":"SIGCOMM '21","sponsor":["SIGCOMM ACM Special Interest Group on Data Communication"]},"container-title":["Proceedings of the 2021 ACM SIGCOMM 2021 Conference"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3452296.3472900","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3452296.3472900","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3452296.3472900","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T22:01:13Z","timestamp":1750197673000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3452296.3472900"}},"subtitle":["high-bandwidth optical network interconnects for machine learning training"],"short-title":[],"issued":{"date-parts":[[2021,8,9]]},"references-count":147,"alternative-id":["10.1145\/3452296.3472900","10.1145\/3452296"],"URL":"https:\/\/doi.org\/10.1145\/3452296.3472900","relation":{},"subject":[],"published":{"date-parts":[[2021,8,9]]},"assertion":[{"value":"2021-08-09","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}