{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,10]],"date-time":"2026-01-10T07:45:26Z","timestamp":1768031126480,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":56,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,6,21]],"date-time":"2021-06-21T00:00:00Z","timestamp":1624233600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100010661","name":"Horizon 2020 Framework Programme","doi-asserted-by":"publisher","award":["713673"],"award-info":[{"award-number":["713673"]}],"id":[{"id":"10.13039\/100010661","id-type":"DOI","asserted-by":"publisher"}]},{"name":"la Caixa Foundation","award":["LCF\/BQ\/DI17\/11620059"],"award-info":[{"award-number":["LCF\/BQ\/DI17\/11620059"]}]},{"name":"JST","award":["JPMJAX190C"],"award-info":[{"award-number":["JPMJAX190C"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,6,21]]},"DOI":"10.1145\/3431379.3460644","type":"proceedings-article","created":{"date-parts":[[2021,6,17]],"date-time":"2021-06-17T04:09:26Z","timestamp":1623902966000},"page":"161-173","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":10,"title":["An Oracle for Guiding Large-Scale Model\/Hybrid Parallel Training of Convolutional Neural Networks"],"prefix":"10.1145","author":[{"given":"Albert Njoroge","family":"Kahira","sequence":"first","affiliation":[{"name":"Barcelona Supercomputing Center &amp; Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}]},{"given":"Truong Thao","family":"Nguyen","sequence":"additional","affiliation":[{"name":"National Institute of Advanced Industrial Science and Technology, Tokyo, Japan"}]},{"given":"Leonardo Bautista","family":"Gomez","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center, Barcelona, Spain"}]},{"given":"Ryousei","family":"Takano","sequence":"additional","affiliation":[{"name":"National Institute of Advanced Industrial Science and Technology, Tokyo, Japan"}]},{"given":"Rosa M.","family":"Badia","sequence":"additional","affiliation":[{"name":"Barcelona Supercomputing Center &amp; Universitat Polit\u00e8cnica de Catalunya, Barcelona, Spain"}]},{"given":"Mohamed","family":"Wahib","sequence":"additional","affiliation":[{"name":"National Institute of Advanced Industrial Science and Technology &amp; RIKEN-CCS, Tokyo, Japan"}]}],"member":"320","published-online":{"date-parts":[[2021,6,21]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"ChainerMN: Scalable Distributed Deep Learning Framework. In Workshop on ML Systems in NIPS.","author":"Takuya","unstructured":"Takuya Akiba et al. 2017 . ChainerMN: Scalable Distributed Deep Learning Framework. In Workshop on ML Systems in NIPS. Takuya Akiba et al. 2017. ChainerMN: Scalable Distributed Deep Learning Framework. In Workshop on ML Systems in NIPS."},{"key":"e_1_3_2_1_2_1","volume-title":"Scalable Reduction Collectives with Data Partitioning-based Multi-leader Design (SC '17)","author":"Mohammadreza","unstructured":"Mohammadreza Bayatpour et al. 2017 . Scalable Reduction Collectives with Data Partitioning-based Multi-leader Design (SC '17) . Article 64, 11 pages. Mohammadreza Bayatpour et al. 2017. Scalable Reduction Collectives with Data Partitioning-based Multi-leader Design (SC '17). Article 64, 11 pages."},{"key":"e_1_3_2_1_3_1","unstructured":"Tal Ben-Nun etal 2018. Demystifying Parallel and Distributed Deep Learning: An In-Depth Concurrency Analysis. CoRR abs\/1802.09941 (2018). arXiv:1802.09941  Tal Ben-Nun et al. 2018. Demystifying Parallel and Distributed Deep Learning: An In-Depth Concurrency Analysis. CoRR abs\/1802.09941 (2018). arXiv:1802.09941"},{"key":"e_1_3_2_1_4_1","volume-title":"Brown et al","author":"Tom","year":"2020","unstructured":"Tom B. Brown et al . 2020 . Language Models are Few-Shot Learners . arXiv preprint arXiv:2005.14165 (2020). arXiv:2005.14165 [cs.CL] Tom B. Brown et al. 2020. Language Models are Few-Shot Learners. arXiv preprint arXiv:2005.14165 (2020). arXiv:2005.14165 [cs.CL]"},{"key":"e_1_3_2_1_5_1","volume-title":"Analysis of Model Parallelism for Distributed Neural Networks (EuroMPI '19)","author":"Adri\u00e1n","unstructured":"Adri\u00e1n Castell\u00f3 et al. 2019 . Analysis of Model Parallelism for Distributed Neural Networks (EuroMPI '19) . Article 7, 10 pages. Adri\u00e1n Castell\u00f3 et al. 2019. Analysis of Model Parallelism for Distributed Neural Networks (EuroMPI '19). Article 7, 10 pages."},{"key":"e_1_3_2_1_6_1","unstructured":"Tianqi Chen et al. 2016. Training Deep Nets with Sublinear Memory Cost. ArXiv abs\/1604.06174 (2016).  Tianqi Chen et al. 2016. Training Deep Nets with Sublinear Memory Cost. ArXiv abs\/1604.06174 (2016)."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"Sudheer Chunduri et al. 2019. GPCNeT: Designing a Benchmark Suite for Inducing and Measuring Contention in HPC Networks (SC '19). Article 42 33 pages.  Sudheer Chunduri et al. 2019. GPCNeT: Designing a Benchmark Suite for Inducing and Measuring Contention in HPC Networks (SC '19). Article 42 33 pages.","DOI":"10.1145\/3295500.3356215"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"crossref","unstructured":"Marcin Copik et al. 2020. Extracting Clean Performance Models from Tainted Programs. arXiv:2012.15592 [cs.DC]  Marcin Copik et al. 2020. Extracting Clean Performance Models from Tainted Programs. arXiv:2012.15592 [cs.DC]","DOI":"10.1145\/3437801.3441613"},{"key":"e_1_3_2_1_9_1","volume-title":"Proceedings of the 25th International Conference on Neural Information Processing Systems","author":"Jeffrey","unstructured":"Jeffrey Dean et al. 2012. Large Scale Distributed Deep Networks . In Proceedings of the 25th International Conference on Neural Information Processing Systems ( Lake Tahoe, Nevada). 1223--1231. Jeffrey Dean et al. 2012. Large Scale Distributed Deep Networks. In Proceedings of the 25th International Conference on Neural Information Processing Systems (Lake Tahoe, Nevada). 1223--1231."},{"key":"e_1_3_2_1_10_1","unstructured":"Jens Domke et al. 2019. HyperX Topology: First at-Scale Implementation and Comparison to the Fat-Tree (SC '19). Article 40 23 pages.  Jens Domke et al. 2019. HyperX Topology: First at-Scale Implementation and Comparison to the Fat-Tree (SC '19). Article 40 23 pages."},{"key":"e_1_3_2_1_11_1","unstructured":"Ge Dong et al. 2020. Fully Convolutional Spatio-Temporal Models for Representation Learning in Plasma Science. arXiv preprint arXiv:2007.10468 (2020). arXiv:2007.10468 [physics.comp-ph]  Ge Dong et al. 2020. Fully Convolutional Spatio-Temporal Models for Representation Learning in Plasma Science. arXiv preprint arXiv:2007.10468 (2020). arXiv:2007.10468 [physics.comp-ph]"},{"key":"e_1_3_2_1_12_1","volume-title":"EFLOPS: Algorithm and System Co-Design for a High Performance Distributed Training Platform. In HPCA. 610--622.","author":"J. Dong","year":"2020","unstructured":"J. Dong et al. 2020 . EFLOPS: Algorithm and System Co-Design for a High Performance Distributed Training Platform. In HPCA. 610--622. J. Dong et al. 2020. EFLOPS: Algorithm and System Co-Design for a High Performance Distributed Training Platform. In HPCA. 610--622."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS.2019.00031"},{"key":"e_1_3_2_1_14_1","volume-title":"Channel and Filter Parallelism for Large-Scale CNN Training (SC '19)","author":"Nikoli","unstructured":"Nikoli Dryden et al. 2019 . Channel and Filter Parallelism for Large-Scale CNN Training (SC '19) . Article 46, 13 pages. Nikoli Dryden et al. 2019. Channel and Filter Parallelism for Large-Scale CNN Training (SC '19). Article 46, 13 pages."},{"key":"e_1_3_2_1_15_1","unstructured":"Amir Gholami et al. 2017. Integrated model batch and domain parallelism in training neural networks. arXiv preprint arXiv:1712.04432 (2017).  Amir Gholami et al. 2017. Integrated model batch and domain parallelism in training neural networks. arXiv preprint arXiv:1712.04432 (2017)."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"K. He et al. 2016. Deep Residual Learning for Image Recognition. In CVPR. 770--778.  K. He et al. 2016. Deep Residual Learning for Image Recognition. In CVPR. 770--778.","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_17_1","unstructured":"Yanping Huang et al. 2018. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. CoRR abs\/1811.06965 (2018). arXiv:1811.06965  Yanping Huang et al. 2018. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. CoRR abs\/1811.06965 (2018). arXiv:1811.06965"},{"key":"e_1_3_2_1_18_1","unstructured":"Xianyan Jia et al. 2018. Highly Scalable Deep Learning Training System with Mixed-Precision: Training ImageNet in Four Minutes. CoRR abs\/1807.11205 (2018). arXiv:1807.11205  Xianyan Jia et al. 2018. Highly Scalable Deep Learning Training System with Mixed-Precision: Training ImageNet in Four Minutes. CoRR abs\/1807.11205 (2018). arXiv:1807.11205"},{"key":"e_1_3_2_1_19_1","unstructured":"Zhihao Jia et al. 2018. Exploring hidden dimensions in parallelizing convolutional neural networks. arXiv preprint arXiv:1802.04924 (2018).  Zhihao Jia et al. 2018. Exploring hidden dimensions in parallelizing convolutional neural networks. arXiv preprint arXiv:1802.04924 (2018)."},{"key":"e_1_3_2_1_20_1","article-title":"Layer-Centric Memory Reuse and Data Migration for Extreme- Scale Deep Learning on Many-Core Architectures","volume":"15","author":"Hai Jin","year":"2018","unstructured":"Hai Jin et al. 2018 . Layer-Centric Memory Reuse and Data Migration for Extreme- Scale Deep Learning on Many-Core Architectures . ACM Trans. Archit. Code Optim. 15 , 3, Article 37 (Sept. 2018). https:\/\/doi.org\/10.1145\/3243904 Hai Jin et al. 2018. Layer-Centric Memory Reuse and Data Migration for Extreme- Scale Deep Learning on Many-Core Architectures. ACM Trans. Archit. Code Optim. 15, 3, Article 37 (Sept. 2018). https:\/\/doi.org\/10.1145\/3243904","journal-title":"ACM Trans. Archit. Code Optim."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Albert Kahira et al. 2018. Training Deep Neural Networks with Low Precision Input Data: A Hurricane Prediction Case Study. In High Performance Computing. Springer International Publishing Cham 562--569.  Albert Kahira et al. 2018. Training Deep Neural Networks with Low Precision Input Data: A Hurricane Prediction Case Study. In High Performance Computing. Springer International Publishing Cham 562--569.","DOI":"10.1007\/978-3-030-02465-9_40"},{"key":"e_1_3_2_1_22_1","unstructured":"Chiheon Kim et al. 2020. torchgpipe: On-the-fly Pipeline Parallelism for Training Giant Models. arXiv preprint arXiv:2004.09910 (2020). arXiv:2004.09910 [cs.DC]  Chiheon Kim et al. 2020. torchgpipe: On-the-fly Pipeline Parallelism for Training Giant Models. arXiv preprint arXiv:2004.09910 (2020). arXiv:2004.09910 [cs.DC]"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1006\/jpdc.2001.1761"},{"key":"e_1_3_2_1_24_1","unstructured":"Alexander Kolesnikov et al. 2019. Big Transfer (BiT): General Visual Representation Learning. arXiv preprint arXiv:1912.11370 (2019). arXiv:1912.11370 [cs.CV]  Alexander Kolesnikov et al. 2019. Big Transfer (BiT): General Visual Representation Learning. arXiv preprint arXiv:1912.11370 (2019). arXiv:1912.11370 [cs.CV]"},{"key":"e_1_3_2_1_25_1","volume-title":"One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997","author":"Krizhevsky Alex","year":"2014","unstructured":"Alex Krizhevsky . 2014. One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997 ( 2014 ). Alex Krizhevsky. 2014. One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997 (2014)."},{"key":"e_1_3_2_1_26_1","unstructured":"Thorsten Kurth et al. 2018. Exascale Deep Learning for Climate Analytics (SC '18). 51:1--51:12.  Thorsten Kurth et al. 2018. Exascale Deep Learning for Climate Analytics (SC '18). 51:1--51:12."},{"key":"e_1_3_2_1_27_1","unstructured":"Ang Li et al. 2019. Evaluating Modern GPU Interconnect: PCIe NVLink NV-SLI NVSwitch and GPUDirect. CoRR abs\/1903.04611 (2019). arXiv:1903.04611  Ang Li et al. 2019. Evaluating Modern GPU Interconnect: PCIe NVLink NV-SLI NVSwitch and GPUDirect. CoRR abs\/1903.04611 (2019). arXiv:1903.04611"},{"key":"e_1_3_2_1_28_1","volume-title":"Benanza: Automatic \"Benchmark Generation to Compute \"Lower-bound\" Latency and Inform Optimizations of Deep Learning Models on GPUs. ArXiv abs\/1911.06922","author":"Cheng Li","year":"2019","unstructured":"Cheng Li et al. 2019 . Benanza: Automatic \"Benchmark Generation to Compute \"Lower-bound\" Latency and Inform Optimizations of Deep Learning Models on GPUs. ArXiv abs\/1911.06922 (2019). Cheng Li et al. 2019. Benanza: Automatic \"Benchmark Generation to Compute \"Lower-bound\" Latency and Inform Optimizations of Deep Learning Models on GPUs. ArXiv abs\/1911.06922 (2019)."},{"key":"e_1_3_2_1_29_1","unstructured":"Sangkug Lym et al. 2019. PruneTrain: Fast Neural Network Training by Dynamic Sparse Model Reconfiguration (SC '19). Article 36 13 pages.  Sangkug Lym et al. 2019. PruneTrain: Fast Neural Network Training by Dynamic Sparse Model Reconfiguration (SC '19). Article 36 13 pages."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Maxim Martinasso et al. 2011. A Contention-Aware Performance Model for HPC-Based Networks: A Case Study of the InfiniBand Network. In Euro-Par 2011 Parallel Processing.  Maxim Martinasso et al. 2011. A Contention-Aware Performance Model for HPC-Based Networks: A Case Study of the InfiniBand Network. In Euro-Par 2011 Parallel Processing.","DOI":"10.1007\/978-3-642-23400-2_10"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Amrita Mathuriya et al. 2018. CosmoFlow: Using Deep Learning to Learn the Universe at Scale (SC '18). Article 65 11 pages.  Amrita Mathuriya et al. 2018. CosmoFlow: Using Deep Learning to Learn the Universe at Scale (SC '18). Article 65 11 pages.","DOI":"10.1109\/SC.2018.00068"},{"key":"e_1_3_2_1_32_1","unstructured":"Deepak Narayanan et al. 2019. PipeDream: Generalized Pipeline Parallelism for DNN Training (SOSP '19). 1--15.  Deepak Narayanan et al. 2019. PipeDream: Generalized Pipeline Parallelism for DNN Training (SOSP '19). 1--15."},{"key":"e_1_3_2_1_33_1","volume-title":"CosmoFlow datasets. https:\/\/portal.nersc.gov\/project\/m3363\/. [15","author":"National Energy Research Scientific Computing Center. [n.d.].","year":"2020","unstructured":"National Energy Research Scientific Computing Center. [n.d.]. CosmoFlow datasets. https:\/\/portal.nersc.gov\/project\/m3363\/. [15 January 2020 ]. National Energy Research Scientific Computing Center. [n.d.]. CosmoFlow datasets. https:\/\/portal.nersc.gov\/project\/m3363\/. [15 January 2020]."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"crossref","unstructured":"Truong Thao Nguyen etal 2018. Hierarchical Distributed-Memory Multi-Leader MPI-Allreduce for Deep Learning Workloads (CANDAR18). 216--222.  Truong Thao Nguyen et al. 2018. Hierarchical Distributed-Memory Multi-Leader MPI-Allreduce for Deep Learning Workloads (CANDAR18). 216--222.","DOI":"10.1109\/CANDARW.2018.00048"},{"key":"e_1_3_2_1_35_1","volume-title":"Multi-GPU and multi-node collective communication primitives. https:\/\/developer.nvidia.com\/ nccl. [01","author":"NVIDIA.","year":"2020","unstructured":"NVIDIA. [n.d.]. Collective Communications Library (NCCL) , Multi-GPU and multi-node collective communication primitives. https:\/\/developer.nvidia.com\/ nccl. [01 April 2020 ]. NVIDIA. [n.d.]. Collective Communications Library (NCCL), Multi-GPU and multi-node collective communication primitives. https:\/\/developer.nvidia.com\/ nccl. [01 April 2020]."},{"key":"e_1_3_2_1_36_1","volume-title":"https:\/\/developer.nvidia.com\/gpudirect. [21","author":"Direct NVIDIA.","year":"2020","unstructured":"NVIDIA. [n.d.]. GPU Direct . https:\/\/developer.nvidia.com\/gpudirect. [21 April 2020 ]. NVIDIA. [n.d.]. GPUDirect. https:\/\/developer.nvidia.com\/gpudirect. [21 April 2020]."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"J. Gregory Pauloski et al. 2020. Convolutional Neural Network Training with Distributed K-FAC. arXiv preprint arXiv:2007.00784 (2020).  J. Gregory Pauloski et al. 2020. Convolutional Neural Network Training with Distributed K-FAC. arXiv preprint arXiv:2007.00784 (2020).","DOI":"10.1109\/SC41405.2020.00098"},{"key":"e_1_3_2_1_38_1","unstructured":"Samyam Rajbhandari et al. 2019. ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. ArXiv abs\/1910.02054 (2019).  Samyam Rajbhandari et al. 2019. ZeRO: Memory Optimization Towards Training A Trillion Parameter Models. ArXiv abs\/1910.02054 (2019)."},{"key":"e_1_3_2_1_39_1","volume-title":"SparCML: High-Performance Sparse Communication for Machine Learning (SC '19)","author":"Cedric","unstructured":"Cedric Renggli et al. 2019 . SparCML: High-Performance Sparse Communication for Machine Learning (SC '19) . Article 11, 15 pages. Cedric Renggli et al. 2019. SparCML: High-Performance Sparse Communication for Machine Learning (SC '19). Article 11, 15 pages."},{"key":"e_1_3_2_1_40_1","volume-title":"Rico-Gallego et al","author":"Juan","year":"2019","unstructured":"Juan A. Rico-Gallego et al . 2019 . A Survey of Communication Performance Models for High-Performance Computing. ACM Comput. Surv. 51, 6, Article 126 (Jan. 2019), 36 pages. Juan A. Rico-Gallego et al. 2019. A Survey of Communication Performance Models for High-Performance Computing. ACM Comput. Surv. 51, 6, Article 126 (Jan. 2019), 36 pages."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"crossref","unstructured":"Olga Russakovsky et al. 2015. Imagenet large scale visual recognition challenge. International journal of computer vision 115 3 (2015) 211--252.  Olga Russakovsky et al. 2015. Imagenet large scale visual recognition challenge. International journal of computer vision 115 3 (2015) 211--252.","DOI":"10.1007\/s11263-015-0816-y"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.parco.2009.09.001"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"crossref","unstructured":"Frank Seide et al. 2014. 1-Bit Stochastic Gradient Descent and Application to Data-Parallel Distributed Training of Speech DNNs (Interspeech 2014).  Frank Seide et al. 2014. 1-Bit Stochastic Gradient Descent and Application to Data-Parallel Distributed Training of Speech DNNs (Interspeech 2014).","DOI":"10.21437\/Interspeech.2014-274"},{"key":"e_1_3_2_1_44_1","volume-title":"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. CoRR abs\/1502.03167","author":"Ioffe Sergey","year":"2015","unstructured":"Ioffe Sergey et al. 2015 . Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. CoRR abs\/1502.03167 (2015). arXiv:1502.03167 Ioffe Sergey et al. 2015. Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift. CoRR abs\/1502.03167 (2015). arXiv:1502.03167"},{"key":"e_1_3_2_1_45_1","volume-title":"Learning Semantic Representations Using Convolutional Neural Networks for Web Search (WWW '14 Companion). Association for Computing Machinery","author":"Yelong","unstructured":"Yelong Shen et al. 2014 . Learning Semantic Representations Using Convolutional Neural Networks for Web Search (WWW '14 Companion). Association for Computing Machinery , New York, NY, USA, 373--374. Yelong Shen et al. 2014. Learning Semantic Representations Using Convolutional Neural Networks for Web Search (WWW '14 Companion). Association for Computing Machinery, New York, NY, USA, 373--374."},{"key":"e_1_3_2_1_46_1","unstructured":"Mohammad Shoeybi et al. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. ArXiv abs\/1909.08053 (2019).  Mohammad Shoeybi et al. 2019. Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism. ArXiv abs\/1909.08053 (2019)."},{"key":"e_1_3_2_1_47_1","volume-title":"Very Deep Convolutional Networks for Large-Scale Image Recognition. In ICLR","author":"Karen","year":"2015","unstructured":"Karen Simonyan et al. 2015 . Very Deep Convolutional Networks for Large-Scale Image Recognition. In ICLR 2015 . Karen Simonyan et al. 2015. Very Deep Convolutional Networks for Large-Scale Image Recognition. In ICLR 2015."},{"key":"e_1_3_2_1_48_1","volume-title":"Proceedings of LearningSys in NIPS.","author":"Seiya","unstructured":"Seiya Tokui et al. 2015. Chainer: a Next-Generation Open Source Framework for Deep Learning . In Proceedings of LearningSys in NIPS. Seiya Tokui et al. 2015. Chainer: a Next-Generation Open Source Framework for Deep Learning. In Proceedings of LearningSys in NIPS."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Mohamed Wahib et al. 2020. Scaling Distributed Deep Learning Workloads beyond the Memory Capacity with KARMA. arXiv preprint arXiv:2008.11421 (2020).  Mohamed Wahib et al. 2020. Scaling Distributed Deep Learning Workloads beyond the Memory Capacity with KARMA. arXiv preprint arXiv:2008.11421 (2020).","DOI":"10.1109\/SC41405.2020.00023"},{"key":"e_1_3_2_1_50_1","unstructured":"Izhar Wallach et al. 2015. AtomNet: A Deep Convolutional Neural Network for Bioactivity Prediction in Structure-based Drug Discovery. arXiv:1510.02855 [cs.LG]  Izhar Wallach et al. 2015. AtomNet: A Deep Convolutional Neural Network for Bioactivity Prediction in Structure-based Drug Discovery. arXiv:1510.02855 [cs.LG]"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_1"},{"key":"e_1_3_2_1_52_1","unstructured":"Yuanzhong Xu et al. 2020. Automatic Cross-Replica Sharding of Weight Update in Data-Parallel Training. arXiv preprint arXiv:2004.13336 (2020). arXiv:2004.13336 [cs.DC]  Yuanzhong Xu et al. 2020. Automatic Cross-Replica Sharding of Weight Update in Data-Parallel Training. arXiv preprint arXiv:2004.13336 (2020). arXiv:2004.13336 [cs.DC]"},{"key":"e_1_3_2_1_53_1","unstructured":"Masafumi Yamazaki et al. 2019. Yet Another Accelerated SGD: ResNet-50 Training on ImageNet in 74.7 seconds. CoRR abs\/1903.12650 (2019). arXiv:1903.12650  Masafumi Yamazaki et al. 2019. Yet Another Accelerated SGD: ResNet-50 Training on ImageNet in 74.7 seconds. CoRR abs\/1903.12650 (2019). arXiv:1903.12650"},{"key":"e_1_3_2_1_54_1","volume-title":"Proceedings of the 47th International Conference on Parallel Processing","author":"Yang","year":"2018","unstructured":"Yang You et al. 2018. ImageNet Training in Minutes . In Proceedings of the 47th International Conference on Parallel Processing ( Eugene, OR, USA) (ICPP 2018 ). 1:1--1:10. Yang You et al. 2018. ImageNet Training in Minutes. In Proceedings of the 47th International Conference on Parallel Processing (Eugene, OR, USA) (ICPP 2018). 1:1--1:10."},{"key":"e_1_3_2_1_55_1","volume-title":"Context Encoding for Semantic Segmentation. In CVPR2018","author":"Hang","unstructured":"Hang Zhang et al. 2018 . Context Encoding for Semantic Segmentation. In CVPR2018 . Hang Zhang et al. 2018. Context Encoding for Semantic Segmentation. In CVPR2018."},{"key":"e_1_3_2_1_56_1","unstructured":"Jia Zhihao et al. 2018. Beyond Data and Model Parallelism for Deep Neural Networks. CoRR abs\/1807.05358 (2018). arXiv:1807.05358  Jia Zhihao et al. 2018. Beyond Data and Model Parallelism for Deep Neural Networks. CoRR abs\/1807.05358 (2018). arXiv:1807.05358"}],"event":{"name":"HPDC '21: The 30th International Symposium on High-Performance Parallel and Distributed Computing","location":"Virtual Event Sweden","acronym":"HPDC '21","sponsor":["University of Arizona University of Arizona","SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th International Symposium on High-Performance Parallel and Distributed Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3431379.3460644","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3431379.3460644","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T21:24:46Z","timestamp":1750195486000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3431379.3460644"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,6,21]]},"references-count":56,"alternative-id":["10.1145\/3431379.3460644","10.1145\/3431379"],"URL":"https:\/\/doi.org\/10.1145\/3431379.3460644","relation":{},"subject":[],"published":{"date-parts":[[2021,6,21]]},"assertion":[{"value":"2021-06-21","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}