{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T17:59:27Z","timestamp":1775671167047,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":65,"publisher":"ACM","license":[{"start":{"date-parts":[[2018,10,11]],"date-time":"2018-10-11T00:00:00Z","timestamp":1539216000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100000001","name":"National Science Foundation","doi-asserted-by":"publisher","award":["1723352"],"award-info":[{"award-number":["1723352"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2018,10,11]]},"DOI":"10.1145\/3267809.3267840","type":"proceedings-article","created":{"date-parts":[[2018,9,28]],"date-time":"2018-09-28T18:00:41Z","timestamp":1538157641000},"page":"41-54","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":92,"title":["Parameter Hub"],"prefix":"10.1145","author":[{"given":"Liang","family":"Luo","sequence":"first","affiliation":[{"name":"University of Washington"}]},{"given":"Jacob","family":"Nelson","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"given":"Luis","family":"Ceze","sequence":"additional","affiliation":[{"name":"University of Washington"}]},{"given":"Amar","family":"Phanishayee","sequence":"additional","affiliation":[{"name":"Microsoft Research"}]},{"given":"Arvind","family":"Krishnamurthy","sequence":"additional","affiliation":[{"name":"University of Washington"}]}],"member":"320","published-online":{"date-parts":[[2018,10,11]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"AMD EPYC. http:\/\/www.amd.com\/en\/products\/epyc. AMD EPYC. 
http:\/\/www.amd.com\/en\/products\/epyc."},{"key":"e_1_3_2_1_2_1","unstructured":"Apache mxnet on aws. https:\/\/aws.amazon.com\/mxnet\/. (Accessed on 05\/09\/2018). Apache mxnet on aws. https:\/\/aws.amazon.com\/mxnet\/. (Accessed on 05\/09\/2018)."},{"key":"e_1_3_2_1_3_1","unstructured":"Arista 7060cx-32s price. https:\/\/goo.gl\/cqyBtA. Arista 7060cx-32s price. https:\/\/goo.gl\/cqyBtA."},{"key":"e_1_3_2_1_4_1","unstructured":"Azure Windows VM sizes - HPC. https:\/\/docs.microsoft.com\/en-us\/azure\/virtual-machines\/windows\/sizes-hpc. (Accessed on 01\/11\/2018). Azure Windows VM sizes - HPC. https:\/\/docs.microsoft.com\/en-us\/azure\/virtual-machines\/windows\/sizes-hpc. (Accessed on 01\/11\/2018)."},{"key":"e_1_3_2_1_5_1","unstructured":"baidu-research\/baidu-allreduce. https:\/\/github.com\/baidu-research\/baidu-allreduce. (Accessed on 05\/14\/2018). baidu-research\/baidu-allreduce. https:\/\/github.com\/baidu-research\/baidu-allreduce. (Accessed on 05\/14\/2018)."},{"key":"e_1_3_2_1_6_1","unstructured":"Cloud tpus - ml accelerators for tensorflow \u00a0|\u00a0 google cloud. https:\/\/cloud.google.com\/tpu\/. (Accessed on 05\/16\/2018). Cloud tpus - ml accelerators for tensorflow \u00a0|\u00a0 google cloud. https:\/\/cloud.google.com\/tpu\/. (Accessed on 05\/16\/2018)."},{"key":"e_1_3_2_1_7_1","unstructured":"Distributed training | caffe2. https:\/\/caffe2.ai\/docs\/distributed-training.html. (Accessed on 05\/09\/2018). Distributed training | caffe2. https:\/\/caffe2.ai\/docs\/distributed-training.html. (Accessed on 05\/09\/2018)."},{"key":"e_1_3_2_1_8_1","unstructured":"Ec2instances.info Easy Amazon EC2 instance comparison. https:\/\/www.ec2instances.info\/?region=us-west-2. Ec2instances.info Easy Amazon EC2 instance comparison. https:\/\/www.ec2instances.info\/?region=us-west-2."},{"key":"e_1_3_2_1_9_1","unstructured":"Epyc benchmarks. https:\/\/www.amd.com\/en\/products\/epycbenchmarks. Epyc benchmarks. 
https:\/\/www.amd.com\/en\/products\/epycbenchmarks."},{"key":"e_1_3_2_1_10_1","unstructured":"Machine learning | microsoft azure. https:\/\/azure.microsoft.com\/en-us\/services\/machine-learning-studio\/. (Accessed on 05\/16\/2018). Machine learning | microsoft azure. https:\/\/azure.microsoft.com\/en-us\/services\/machine-learning-studio\/. (Accessed on 05\/16\/2018)."},{"key":"e_1_3_2_1_11_1","unstructured":"Mellanox ethernet cable prices. https:\/\/store.mellanox.com\/categories\/interconnect\/ethernet-cables\/direct-attach-copper-cables.html. Mellanox ethernet cable prices. https:\/\/store.mellanox.com\/categories\/interconnect\/ethernet-cables\/direct-attach-copper-cables.html."},{"key":"e_1_3_2_1_12_1","unstructured":"Mellanox ethernet card prices. https:\/\/store.mellanox.com\/categories\/adapters\/ethernet-adapter-cards.html. Mellanox ethernet card prices. https:\/\/store.mellanox.com\/categories\/adapters\/ethernet-adapter-cards.html."},{"key":"e_1_3_2_1_13_1","unstructured":"Mxnet on the cloud \u2014 mxnet documentation. https:\/\/mxnet.incubator.apache.org\/faq\/cloud.html?highlight=ec2. (Accessed on 05\/09\/2018). Mxnet on the cloud \u2014 mxnet documentation. https:\/\/mxnet.incubator.apache.org\/faq\/cloud.html?highlight=ec2. (Accessed on 05\/09\/2018)."},{"key":"e_1_3_2_1_14_1","unstructured":"Nvidia 1080 ti advertised price. https:\/\/www.nvidia.com\/en-us\/geforce\/products\/10series\/geforce-gtx-1080-ti. Nvidia 1080 ti advertised price. https:\/\/www.nvidia.com\/en-us\/geforce\/products\/10series\/geforce-gtx-1080-ti."},{"key":"e_1_3_2_1_15_1","unstructured":"Performance of distributed deep learning using ChainerMN. https:\/\/chainer.org\/general\/2017\/02\/08\/Performance-of-Distributed-Deep-Learning-Using-ChainerMN.html. Performance of distributed deep learning using ChainerMN. 
https:\/\/chainer.org\/general\/2017\/02\/08\/Performance-of-Distributed-Deep-Learning-Using-ChainerMN.html."},{"key":"e_1_3_2_1_16_1","unstructured":"Supermicro phub node price. https:\/\/www.thinkmate.com\/system\/superserver-6038r-txr. Supermicro phub node price. https:\/\/www.thinkmate.com\/system\/superserver-6038r-txr."},{"key":"e_1_3_2_1_17_1","unstructured":"Supermicro worker node price. https:\/\/www.thinkmate.com\/system\/superserver-1028gq-tr. Supermicro worker node price. https:\/\/www.thinkmate.com\/system\/superserver-1028gq-tr."},{"key":"e_1_3_2_1_18_1","unstructured":"ZMQ distributed messaging. http:\/\/zeromq.org\/. ZMQ distributed messaging. http:\/\/zeromq.org\/."},{"key":"e_1_3_2_1_19_1","first-page":"265","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi , Paul Barham , Jianmin Chen , Zhifeng Chen , Andy Davis , Jeffrey Dean , Matthieu Devin , Sanjay Ghemawat , Geoffrey Irving , Michael Isard , Manjunath Kudlur , Josh Levenberg , Rajat Monga , Sherry Moore , Derek G. Murray , Benoit Steiner , Paul Tucker , Vijay Vasudevan , Pete Warden , Martin Wicke , Yuan Yu , and Xiaoqiang Zheng . Tensorflow : A system for large-scale machine learning . In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16) , pages 265 -- 283 , Savannah, GA , 2016 . USENIX Association. Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. Tensorflow: A system for large-scale machine learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16), pages 265--283, Savannah, GA, 2016. 
USENIX Association."},{"key":"e_1_3_2_1_20_1","unstructured":"Michael Alan Aurojit Panda Domenic Bottini Lisa Jian Pranay Kumar and Scott Shenker. Network evolution for dnns. Michael Alan Aurojit Panda Domenic Bottini Lisa Jian Pranay Kumar and Scott Shenker. Network evolution for dnns."},{"key":"e_1_3_2_1_21_1","volume-title":"International Conference on Learning Representations Workshop Track","author":"Chen Jianmin","year":"2016","unstructured":"Jianmin Chen , Rajat Monga , Samy Bengio , and Rafal Jozefowicz . Revisiting distributed synchronous sgd . In International Conference on Learning Representations Workshop Track , 2016 . Jianmin Chen, Rajat Monga, Samy Bengio, and Rafal Jozefowicz. Revisiting distributed synchronous sgd. In International Conference on Learning Representations Workshop Track, 2016."},{"key":"e_1_3_2_1_22_1","volume-title":"MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274","author":"Chen Tianqi","year":"2015","unstructured":"Tianqi Chen , Mu Li , Yutian Li , Min Lin , Naiyan Wang , Minjie Wang , Tianjun Xiao , Bing Xu , Chiyuan Zhang , and Zheng Zhang . MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274 , 2015 . Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. MXNet: A flexible and efficient machine learning library for heterogeneous distributed systems. arXiv preprint arXiv:1512.01274, 2015."},{"key":"e_1_3_2_1_23_1","volume-title":"Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174","author":"Chen Tianqi","year":"2016","unstructured":"Tianqi Chen , Bing Xu , Chiyuan Zhang , and Carlos Guestrin . Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174 , 2016 . Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. Training deep nets with sublinear memory cost. 
arXiv preprint arXiv:1604.06174, 2016."},{"key":"e_1_3_2_1_24_1","first-page":"571","volume-title":"11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)","author":"Chilimbi Trishul","year":"2014","unstructured":"Trishul Chilimbi , Yutaka Suzue , Johnson Apacible , and Karthik Kalyanaraman . Project adam : Building an efficient and scalable deep learning training system . In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14) , pages 571 -- 582 , Broomfield, CO , 2014 . USENIX Association. Trishul Chilimbi, Yutaka Suzue, Johnson Apacible, and Karthik Kalyanaraman. Project adam: Building an efficient and scalable deep learning training system. In 11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14), pages 571--582, Broomfield, CO, 2014. USENIX Association."},{"key":"e_1_3_2_1_25_1","first-page":"37","volume-title":"USENIX Annual Technical Conference","author":"Cui Henggang","year":"2014","unstructured":"Henggang Cui , James Cipar , Qirong Ho , Jin Kyu Kim , Seunghak Lee , Abhimanu Kumar , Jinliang Wei , Wei Dai , Gregory R Ganger , Phillip B Gibbons , Exploiting bounded staleness to speed up big data analytics . In USENIX Annual Technical Conference , pages 37 -- 48 , 2014 . Henggang Cui, James Cipar, Qirong Ho, Jin Kyu Kim, Seunghak Lee, Abhimanu Kumar, Jinliang Wei, Wei Dai, Gregory R Ganger, Phillip B Gibbons, et al. Exploiting bounded staleness to speed up big data analytics. In USENIX Annual Technical Conference, pages 37--48, 2014."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901323"},{"key":"e_1_3_2_1_27_1","volume-title":"High-performance distributed ML at scale through parameter server consistency models. CoRR, abs\/1410.8043","author":"Dai Wei","year":"2014","unstructured":"Wei Dai , Abhimanu Kumar , Jinliang Wei , Qirong Ho , Garth A. Gibson , and Eric P. Xing . 
High-performance distributed ML at scale through parameter server consistency models. CoRR, abs\/1410.8043 , 2014 . Wei Dai, Abhimanu Kumar, Jinliang Wei, Qirong Ho, Garth A. Gibson, and Eric P. Xing. High-performance distributed ML at scale through parameter server consistency models. CoRR, abs\/1410.8043, 2014."},{"key":"e_1_3_2_1_28_1","first-page":"1223","volume-title":"Proceedings of the 25th International Conference on Neural Information Processing Systems -","volume":"1","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean , Greg S. Corrado , Rajat Monga , Kai Chen , Matthieu Devin , Quoc V. Le , Mark Z. Mao , Marc'Aurelio Ranzato , Andrew Senior , Paul Tucker , Ke Yang , and Andrew Y. Ng . Large scale distributed deep networks . In Proceedings of the 25th International Conference on Neural Information Processing Systems - Volume 1 , NIPS'12, pages 1223 -- 1231 , USA, 2012 . Curran Associates Inc. Jeffrey Dean, Greg S. Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Quoc V. Le, Mark Z. Mao, Marc'Aurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, and Andrew Y. Ng. Large scale distributed deep networks. In Proceedings of the 25th International Conference on Neural Information Processing Systems - Volume 1, NIPS'12, pages 1223--1231, USA, 2012. Curran Associates Inc."},{"key":"e_1_3_2_1_29_1","volume-title":"large minibatch SGD: Training ImageNet in 1 hour. arXiv preprint arXiv: 1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal , Piotr Doll\u00e1r , Ross Girshick , Pieter Noordhuis , Lukasz Wesolowski , Aapo Kyrola , Andrew Tulloch , Yangqing Jia , and Kaiming He. Accurate , large minibatch SGD: Training ImageNet in 1 hour. arXiv preprint arXiv: 1706.02677 , 2017 . Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. Accurate, large minibatch SGD: Training ImageNet in 1 hour. 
arXiv preprint arXiv: 1706.02677, 2017."},{"key":"e_1_3_2_1_30_1","volume-title":"Vl2: A scalable and flexible data center network","author":"Greenberg Albert","year":"2009","unstructured":"Albert Greenberg , James R. Hamilton , Navendu Jain , Srikanth Kandula , Changhoon Kim , Parantap Lahiri , Dave Maltz , Parveen Patel , and Sudipta Sengupta . Vl2: A scalable and flexible data center network . Association for Computing Machinery, Inc. , August 2009 . Albert Greenberg, James R. Hamilton, Navendu Jain, Srikanth Kandula, Changhoon Kim, Parantap Lahiri, Dave Maltz, Parveen Patel, and Sudipta Sengupta. Vl2: A scalable and flexible data center network. Association for Computing Machinery, Inc., August 2009."},{"key":"e_1_3_2_1_31_1","volume-title":"Deep residual learning for image recognition","author":"He Kaiming","year":"2015","unstructured":"Kaiming He , Xiangyu Zhang , Shaoqing Ren , and Jian Sun . Deep residual learning for image recognition , 2015 . Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. Deep residual learning for image recognition, 2015."},{"key":"e_1_3_2_1_32_1","first-page":"1223","volume-title":"Phillip B Gibbons, Garth A Gibson, Greg Ganger, and Eric P Xing. More effective distributed ML via a stale synchronous parallel parameter server. In Advances in neural information processing systems","author":"Ho Qirong","year":"2013","unstructured":"Qirong Ho , James Cipar , Henggang Cui , Seunghak Lee , Jin Kyu Kim , Phillip B Gibbons, Garth A Gibson, Greg Ganger, and Eric P Xing. More effective distributed ML via a stale synchronous parallel parameter server. In Advances in neural information processing systems , pages 1223 -- 1231 , 2013 . Qirong Ho, James Cipar, Henggang Cui, Seunghak Lee, Jin Kyu Kim, Phillip B Gibbons, Garth A Gibson, Greg Ganger, and Eric P Xing. More effective distributed ML via a stale synchronous parallel parameter server. 
In Advances in neural information processing systems, pages 1223--1231, 2013."},{"key":"e_1_3_2_1_33_1","volume-title":"CARNEGIE-MELLON UNIV PITTSBURGH PA SCHOOL OF COMPUTER SCIENCE","author":"Hu Ningning","year":"2002","unstructured":"Ningning Hu and Peter Steenkiste . Estimating available bandwidth using packet pair probing. Technical report , CARNEGIE-MELLON UNIV PITTSBURGH PA SCHOOL OF COMPUTER SCIENCE , 2002 . Ningning Hu and Peter Steenkiste. Estimating available bandwidth using packet pair probing. Technical report, CARNEGIE-MELLON UNIV PITTSBURGH PA SCHOOL OF COMPUTER SCIENCE, 2002."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSAC.2003.814505"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.284"},{"key":"e_1_3_2_1_36_1","first-page":"437","volume-title":"2016 USENIX Annual Technical Conference (USENIX ATC 16)","author":"Kalia Anuj","year":"2016","unstructured":"Anuj Kalia , Michael Kaminsky , and David G. Andersen . Design guidelines for high performance RDMA systems . In 2016 USENIX Annual Technical Conference (USENIX ATC 16) , pages 437 -- 450 , Denver, CO , 2016 . USENIX Association. Anuj Kalia, Michael Kaminsky, and David G. Andersen. Design guidelines for high performance RDMA systems. In 2016 USENIX Annual Technical Conference (USENIX ATC 16), pages 437--450, Denver, CO, 2016. USENIX Association."},{"key":"e_1_3_2_1_37_1","volume-title":"On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836","author":"Keskar Nitish Shirish","year":"2016","unstructured":"Nitish Shirish Keskar , Dheevatsa Mudigere , Jorge Nocedal , Mikhail Smelyanskiy , and Ping Tak Peter Tang . On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836 , 2016 . Nitish Shirish Keskar, Dheevatsa Mudigere, Jorge Nocedal, Mikhail Smelyanskiy, and Ping Tak Peter Tang. 
On large-batch training for deep learning: Generalization gap and sharp minima. arXiv preprint arXiv:1609.04836, 2016."},{"key":"e_1_3_2_1_38_1","unstructured":"Alexandros Koliousis Pijika Watcharapichat Matthias Weidlich Paolo Costa and Peter Pietzuch. Crossbow: Scaling deep learning on multigpu servers. Alexandros Koliousis Pijika Watcharapichat Matthias Weidlich Paolo Costa and Peter Pietzuch. Crossbow: Scaling deep learning on multigpu servers."},{"key":"e_1_3_2_1_39_1","series-title":"Lecture Notes in Computer Science, 1524","volume-title":"Efficient backprop in neural networks: Tricks of the trade","author":"LeCun Y","unstructured":"Y LeCun , L Bottou , and G Orr . Efficient backprop in neural networks: Tricks of the trade . Lecture Notes in Computer Science, 1524 . Y LeCun, L Bottou, and G Orr. Efficient backprop in neural networks: Tricks of the trade. Lecture Notes in Computer Science, 1524."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.5555\/2685048.2685095"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.5555\/2968826.2968829"},{"key":"e_1_3_2_1_42_1","volume-title":"Deep gradient compression: Reducing the communication bandwidth for distributed training. CoRR, abs\/1712.01887","author":"Lin Yujun","year":"2017","unstructured":"Yujun Lin , Song Han , Huizi Mao , Yu Wang , and William J. Dally . Deep gradient compression: Reducing the communication bandwidth for distributed training. CoRR, abs\/1712.01887 , 2017 . Yujun Lin, Song Han, Huizi Mao, Yu Wang, and William J. Dally. Deep gradient compression: Reducing the communication bandwidth for distributed training. CoRR, abs\/1712.01887, 2017."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037731"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/1592568.1592575"},{"key":"e_1_3_2_1_45_1","first-page":"543","volume-title":"Doklady an SSSR","author":"Nesterov Yurii","year":"1983","unstructured":"Yurii Nesterov . 
A method for unconstrained convex minimization problem with the rate of convergence o (1\/k2) . In Doklady an SSSR , volume 269 , pages 543 -- 547 , 1983 . Yurii Nesterov. A method for unconstrained convex minimization problem with the rate of convergence o (1\/k2). In Doklady an SSSR, volume 269, pages 543--547, 1983."},{"key":"e_1_3_2_1_46_1","first-page":"693","volume-title":"Advances in neural information processing systems","author":"Recht Benjamin","year":"2011","unstructured":"Benjamin Recht , Christopher Re , Stephen Wright , and Feng Niu . Hogwild: A lock-free approach to parallelizing stochastic gradient descent . In Advances in neural information processing systems , pages 693 -- 701 , 2011 . Benjamin Recht, Christopher Re, Stephen Wright, and Feng Niu. Hogwild: A lock-free approach to parallelizing stochastic gradient descent. In Advances in neural information processing systems, pages 693--701, 2011."},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/2829988.2787472"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","first-page":"696","DOI":"10.7551\/mitpress\/4943.003.0042","volume-title":"Neurocomputing: Foundations of research","author":"Rumelhart David E.","year":"1988","unstructured":"David E. Rumelhart , Geoffrey E. Hinton , and Ronald J. Williams . Neurocomputing: Foundations of research . chapter Learning Representations by Back-propagating Errors, pages 696 -- 699 . MIT Press , Cambridge, MA, USA , 1988 . David E. Rumelhart, Geoffrey E. Hinton, and Ronald J. Williams. Neurocomputing: Foundations of research. chapter Learning Representations by Back-propagating Errors, pages 696--699. MIT Press, Cambridge, MA, USA, 1988."},{"key":"e_1_3_2_1_49_1","volume-title":"Interspeech 2014","author":"Seide Frank","year":"2014","unstructured":"Frank Seide , Hao Fu , Jasha Droppo , Gang Li , and Dong Yu . 1-bit stochastic gradient descent and application to data-parallel distributed training of speech DNNs . 
In Interspeech 2014 , September 2014 . Frank Seide, Hao Fu, Jasha Droppo, Gang Li, and Dong Yu. 1-bit stochastic gradient descent and application to data-parallel distributed training of speech DNNs. In Interspeech 2014, September 2014."},{"key":"e_1_3_2_1_50_1","volume-title":"Horovod: fast and easy distributed deep learning in tensorflow. CoRR, abs\/1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso . Horovod: fast and easy distributed deep learning in tensorflow. CoRR, abs\/1802.05799 , 2018 . Alexander Sergeev and Mike Del Balso. Horovod: fast and easy distributed deep learning in tensorflow. CoRR, abs\/1802.05799, 2018."},{"key":"e_1_3_2_1_51_1","volume-title":"Benchmarking state-of-the-art deep learning software tools","author":"Shi Shaohuai","year":"2016","unstructured":"Shaohuai Shi , Qiang Wang , Pengfei Xu , and Xiaowen Chu . Benchmarking state-of-the-art deep learning software tools , 2016 . Shaohuai Shi, Qiang Wang, Pengfei Xu, and Xiaowen Chu. Benchmarking state-of-the-art deep learning software tools, 2016."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/2785956.2787508"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.14778\/1920841.1920931"},{"key":"e_1_3_2_1_54_1","volume-title":"Deepconfig: Automating data center network topologies management with machine learning. CoRR, abs\/1712.03890","author":"Streiffer Christopher","year":"2017","unstructured":"Christopher Streiffer , Huan Chen , Theophilus Benson , and Asim Kadav . Deepconfig: Automating data center network topologies management with machine learning. CoRR, abs\/1712.03890 , 2017 . Christopher Streiffer, Huan Chen, Theophilus Benson, and Asim Kadav. Deepconfig: Automating data center network topologies management with machine learning. CoRR, abs\/1712.03890, 2017."},{"key":"e_1_3_2_1_55_1","volume-title":"inception-resnet and the impact of residual connections on learning. 
CoRR, abs\/1602.07261","author":"Szegedy Christian","year":"2016","unstructured":"Christian Szegedy , Sergey Ioffe , and Vincent Vanhoucke . Inception-v4 , inception-resnet and the impact of residual connections on learning. CoRR, abs\/1602.07261 , 2016 . Christian Szegedy, Sergey Ioffe, and Vincent Vanhoucke. Inception-v4, inception-resnet and the impact of residual connections on learning. CoRR, abs\/1602.07261, 2016."},{"key":"e_1_3_2_1_56_1","volume-title":"Rethinking the inception architecture for computer vision. CoRR, abs\/1512.00567","author":"Szegedy Christian","year":"2015","unstructured":"Christian Szegedy , Vincent Vanhoucke , Sergey Ioffe , Jonathon Shlens , and Zbigniew Wojna . Rethinking the inception architecture for computer vision. CoRR, abs\/1512.00567 , 2015 . Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, and Zbigniew Wojna. Rethinking the inception architecture for computer vision. CoRR, abs\/1512.00567, 2015."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1177\/1094342005051521"},{"issue":"3","key":"e_1_3_2_1_58_1","first-page":"2","article-title":"Optimal message scheduling for aggregation","volume":"2","author":"Wang Leyuan","year":"2018","unstructured":"Leyuan Wang , Mu Li , Edo Liberty , and Alex J Smola . Optimal message scheduling for aggregation . NETWORKS , 2 ( 3 ): 2 -- 3 , 2018 . Leyuan Wang, Mu Li, Edo Liberty, and Alex J Smola. Optimal message scheduling for aggregation. NETWORKS, 2(3):2--3, 2018.","journal-title":"NETWORKS"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1145\/2806777.2806778"},{"key":"e_1_3_2_1_60_1","volume-title":"Jin Kyu Kim, and Eric P. Xing. Large scale distributed multiclass logistic regression. CoRR, abs\/1409.5705","author":"Xie Pengtao","year":"2014","unstructured":"Pengtao Xie , Jin Kyu Kim, and Eric P. Xing. Large scale distributed multiclass logistic regression. CoRR, abs\/1409.5705 , 2014 . Pengtao Xie, Jin Kyu Kim, and Eric P. Xing. 
Large scale distributed multiclass logistic regression. CoRR, abs\/1409.5705, 2014."},{"key":"e_1_3_2_1_61_1","volume-title":"Aggregated residual transformations for deep neural networks. CoRR, abs\/1611.05431","author":"Xie Saining","year":"2016","unstructured":"Saining Xie , Ross B. Girshick , Piotr Doll\u00e1r , Zhuowen Tu , and Kaiming He . Aggregated residual transformations for deep neural networks. CoRR, abs\/1611.05431 , 2016 . Saining Xie, Ross B. Girshick, Piotr Doll\u00e1r, Zhuowen Tu, and Kaiming He. Aggregated residual transformations for deep neural networks. CoRR, abs\/1611.05431, 2016."},{"key":"e_1_3_2_1_62_1","unstructured":"Yang You Zhao Zhang Cho-Jui Hsieh James Demmel and Kurt Keutzer. Speeding up imagenet training on supercomputers. Yang You Zhao Zhang Cho-Jui Hsieh James Demmel and Kurt Keutzer. Speeding up imagenet training on supercomputers."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.14778\/2732977.2733001"},{"key":"e_1_3_2_1_64_1","first-page":"181","volume-title":"2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Zhang Hao","year":"2017","unstructured":"Hao Zhang , Zeyu Zheng , Shizhen Xu , Wei Dai , Qirong Ho , Xiaodan Liang , Zhiting Hu , Jinliang Wei , Pengtao Xie , and Eric P. Xing . Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters . In 2017 USENIX Annual Technical Conference (USENIX ATC 17) , pages 181 -- 193 , Santa Clara, CA , 2017 . USENIX Association. Hao Zhang, Zeyu Zheng, Shizhen Xu, Wei Dai, Qirong Ho, Xiaodan Liang, Zhiting Hu, Jinliang Wei, Pengtao Xie, and Eric P. Xing. Poseidon: An efficient communication architecture for distributed deep learning on GPU clusters. In 2017 USENIX Annual Technical Conference (USENIX ATC 17), pages 181--193, Santa Clara, CA, 2017. USENIX Association."},{"key":"e_1_3_2_1_65_1","volume-title":"March","author":"Zhu H.","year":"2018","unstructured":"H. Zhu , M. Akrout , B. Zheng , A. Pelegris , A. 
Phanishayee , B. Schroeder , and G. Pekhimenko . TBD: Benchmarking and Analyzing Deep Neural Network Training. ArXiv e-prints , March 2018 . H. Zhu, M. Akrout, B. Zheng, A. Pelegris, A. Phanishayee, B. Schroeder, and G. Pekhimenko. TBD: Benchmarking and Analyzing Deep Neural Network Training. ArXiv e-prints, March 2018."}],"event":{"name":"SoCC '18: ACM Symposium on Cloud Computing","location":"Carlsbad CA USA","acronym":"SoCC '18","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3267809.3267840","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3267809.3267840","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3267809.3267840","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T00:44:30Z","timestamp":1750207470000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3267809.3267840"}},"subtitle":["a Rack-Scale Parameter Server for Distributed Deep Neural Network Training"],"short-title":[],"issued":{"date-parts":[[2018,10,11]]},"references-count":65,"alternative-id":["10.1145\/3267809.3267840","10.1145\/3267809"],"URL":"https:\/\/doi.org\/10.1145\/3267809.3267840","relation":{},"subject":[],"published":{"date-parts":[[2018,10,11]]},"assertion":[{"value":"2018-10-11","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}