{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,4]],"date-time":"2026-02-04T18:16:56Z","timestamp":1770229016211,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":44,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,8,7]],"date-time":"2023-08-07T00:00:00Z","timestamp":1691366400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/100007219","name":"Natural Science Foundation of Shanghai","doi-asserted-by":"publisher","award":["23ZR1404900"],"award-info":[{"award-number":["23ZR1404900"]}],"id":[{"id":"10.13039\/100007219","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62150610497, 62172108, 62002066"],"award-info":[{"award-number":["62150610497, 62172108, 62002066"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Major Key Project of PCL"},{"name":"Open Research Projects of Zhejiang Lab","award":["2022QA0AB07"],"award-info":[{"award-number":["2022QA0AB07"]}]},{"name":"Key-Area Research and Development Program of Guangdong Province","award":["2021B0101400001"],"award-info":[{"award-number":["2021B0101400001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,8,7]]},"DOI":"10.1145\/3605573.3605650","type":"proceedings-article","created":{"date-parts":[[2023,9,13]],"date-time":"2023-09-13T16:21:16Z","timestamp":1694622076000},"page":"102-111","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["OSP: Boosting Distributed Model Training with 2-stage Synchronization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-0126-0387","authenticated-orcid":false,"given":"Zixuan","family":"Chen","sequence":"first","affiliation":[{"name":"Fudan University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0004-0649-5443","authenticated-orcid":false,"given":"Lei","family":"Shi","sequence":"additional","affiliation":[{"name":"Fudan University, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6521-6200","authenticated-orcid":false,"given":"Xuandong","family":"Liu","sequence":"additional","affiliation":[{"name":"Fudan University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6559-188X","authenticated-orcid":false,"given":"Jiahui","family":"Li","sequence":"additional","affiliation":[{"name":"Fudan University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2230-7671","authenticated-orcid":false,"given":"Sen","family":"Liu","sequence":"additional","affiliation":[{"name":"School of Computer Science, Fudan University, China and Institute of FinTech, Fudan University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0958-8547","authenticated-orcid":false,"given":"Yang","family":"Xu","sequence":"additional","affiliation":[{"name":"Fudan University, China and Peng Cheng Laboratory, China"}]}],"member":"320","published-online":{"date-parts":[[2023,9,13]]},"reference":[{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3158369"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2021"},{"key":"e_1_3_2_1_4_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer.J","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter\u00a0J Liu, 2020. Exploring the limits of transfer learning with a unified text-to-text transformer.J. Mach. Learn. Res. 21, 140 (2020), 1\u201367.","journal-title":"Mach. Learn. Res."},{"key":"e_1_3_2_1_5_1","volume-title":"Attention is all you need. Advances in neural information processing systems 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1613\/jair.1.12007"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01129"},{"key":"e_1_3_2_1_9_1","volume-title":"9th International Conference on Learning Representations, ICLR 2021","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. In 9th International Conference on Learning Representations, ICLR 2021, Virtual Event, Austria, May 3-7, 2021. OpenReview.net. https:\/\/openreview.net\/forum?id=YicbFdNTTy"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545024"},{"key":"e_1_3_2_1_11_1","unstructured":"Andrew Gibiansky and Greg Diamos. [n. d.]. GitHub - Baidu-Research\/Baidu-allreduce. https:\/\/github.com\/baidu-research\/baidu-allreduce"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.5555\/3488766.3488792"},{"key":"e_1_3_2_1_13_1","volume-title":"ATP: In-network Aggregation for Multi-tenant Learning.. In NSDI, Vol.\u00a021. 741\u2013761.","author":"Lao ChonLam","year":"2021","unstructured":"ChonLam Lao, Yanfang Le, Kshiteej Mahajan, Yixi Chen, Wenfei Wu, Aditya Akella, and Michael\u00a0M Swift. 2021. ATP: In-network Aggregation for Multi-tenant Learning.. In NSDI, Vol.\u00a021. 741\u2013761."},{"key":"e_1_3_2_1_14_1","volume-title":"International Conference on Machine Learning. PMLR","author":"Shi Yunpeng","year":"2022","unstructured":"Yunpeng Shi, Cole\u00a0M Wyeth, and Gilad Lerman. 2022. Robust Group Synchronization via Quadratic Programming. In International Conference on Machine Learning. PMLR, 20095\u201320105."},{"key":"e_1_3_2_1_15_1","volume-title":"International Conference on Machine Learning. PMLR, 436\u2013445","author":"Aviv Rotem\u00a0Zamir","year":"2021","unstructured":"Rotem\u00a0Zamir Aviv, Ido Hakimi, Assaf Schuster, and Kfir\u00a0Yehuda Levy. 2021. Asynchronous distributed learning: Adapting to gradient delays without prior knowledge. In International Conference on Machine Learning. PMLR, 436\u2013445."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1006\/jpdc.1994.1085"},{"key":"e_1_3_2_1_17_1","volume-title":"Distributed delayed stochastic optimization. Advances in neural information processing systems 24","author":"Agarwal Alekh","year":"2011","unstructured":"Alekh Agarwal and John\u00a0C Duchi. 2011. Distributed delayed stochastic optimization. Advances in neural information processing systems 24 (2011)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/2342356.2342390"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Zixuan Chen Lei Shi Xuandong Liu Xin Ai Sen Liu and Yang Xu. 2023. Boosting Distributed Machine Learning Training Through Loss-tolerant Transmission Protocol. arxiv:2305.04279\u00a0[cs.DC]","DOI":"10.1109\/IWQoS57198.2023.10188699"},{"key":"e_1_3_2_1_20_1","volume-title":"More effective distributed ml via a stale synchronous parallel parameter server. Advances in neural information processing systems 26","author":"Ho Qirong","year":"2013","unstructured":"Qirong Ho, James Cipar, Henggang Cui, Seunghak Lee, Jin\u00a0Kyu Kim, Phillip\u00a0B Gibbons, Garth\u00a0A Gibson, Greg Ganger, and Eric\u00a0P Xing. 2013. More effective distributed ml via a stale synchronous parallel parameter server. Advances in neural information processing systems 26 (2013)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737587"},{"key":"e_1_3_2_1_22_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, 2019. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS51616.2021.00057"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737367"},{"key":"e_1_3_2_1_25_1","volume-title":"Sparse communication for distributed gradient descent. arXiv preprint arXiv:1704.05021","author":"Aji Alham\u00a0Fikri","year":"2017","unstructured":"Alham\u00a0Fikri Aji and Kenneth Heafield. 2017. Sparse communication for distributed gradient descent. arXiv preprint arXiv:1704.05021 (2017)."},{"key":"e_1_3_2_1_26_1","volume-title":"Sparsified SGD with memory. Advances in Neural Information Processing Systems 31","author":"Stich U","year":"2018","unstructured":"Sebastian\u00a0U Stich, Jean-Baptiste Cordonnier, and Martin Jaggi. 2018. Sparsified SGD with memory. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_27_1","volume-title":"Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training. In International Conference on Learning Representations.","author":"Lin Yujun","year":"2018","unstructured":"Yujun Lin, Song Han, Huizi Mao, Yu Wang, and Bill Dally. 2018. Deep Gradient Compression: Reducing the Communication Bandwidth for Distributed Training. In International Conference on Learning Representations."},{"key":"e_1_3_2_1_28_1","volume-title":"On the momentum term in gradient descent learning algorithms. Neural networks 12, 1","author":"Qian Ning","year":"1999","unstructured":"Ning Qian. 1999. On the momentum term in gradient descent learning algorithms. Neural networks 12, 1 (1999), 145\u2013151."},{"key":"e_1_3_2_1_29_1","volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS51616.2021.00060"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01152"},{"key":"e_1_3_2_1_32_1","unstructured":"NVIDIA. 2020. CUDA release: 11.2. https:\/\/developer.nvidia.com\/cuda-toolkit"},{"key":"e_1_3_2_1_33_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556","author":"Simonyan Karen","year":"2014","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_34_1","unstructured":"Alex Krizhevsky Geoffrey Hinton 2009. Learning multiple layers of features from tiny images. (2009)."},{"key":"e_1_3_2_1_35_1","volume-title":"2017 2nd international conference on image, vision and computing (ICIVC). IEEE, 783\u2013787","author":"Xia Xiaoling","year":"2017","unstructured":"Xiaoling Xia, Cui Xu, and Bing Nan. 2017. Inception-v3 for flower classification. In 2017 2nd international conference on image, vision and computing (ICIVC). IEEE, 783\u2013787."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_37_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_38_1","volume-title":"100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250","author":"Rajpurkar Pranav","year":"2016","unstructured":"Pranav Rajpurkar, Jian Zhang, Konstantin Lopyrev, and Percy Liang. 2016. Squad: 100,000+ questions for machine comprehension of text. arXiv preprint arXiv:1606.05250 (2016)."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDCS.2019.00150"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3040601"},{"key":"e_1_3_2_1_41_1","volume-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. arXiv preprint arXiv:1510.00149","author":"Han Song","year":"2015","unstructured":"Song Han, Huizi Mao, and William\u00a0J Dally. 2015. Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. arXiv preprint arXiv:1510.00149 (2015)."},{"key":"e_1_3_2_1_42_1","volume-title":"8-bit approximations for parallelism in deep learning. arXiv preprint arXiv:1511.04561","author":"Dettmers Tim","year":"2015","unstructured":"Tim Dettmers. 2015. 8-bit approximations for parallelism in deep learning. arXiv preprint arXiv:1511.04561 (2015)."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3343180.3343186"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.future.2021.03.006"},{"key":"e_1_3_2_1_45_1","volume-title":"Scaling distributed machine learning with in-network aggregation. arXiv preprint arXiv:1903.06701","author":"Sapio Amedeo","year":"2019","unstructured":"Amedeo Sapio, Marco Canini, Chen-Yu Ho, Jacob Nelson, Panos Kalnis, Changhoon Kim, Arvind Krishnamurthy, Masoud Moshref, Dan\u00a0RK Ports, and Peter Richt\u00e1rik. 2019. Scaling distributed machine learning with in-network aggregation. arXiv preprint arXiv:1903.06701 (2019)."}],"event":{"name":"ICPP 2023: 52nd International Conference on Parallel Processing","location":"Salt Lake City UT USA","acronym":"ICPP 2023"},"container-title":["Proceedings of the 52nd International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605573.3605650","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3605573.3605650","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T17:49:04Z","timestamp":1750182544000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3605573.3605650"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,8,7]]},"references-count":44,"alternative-id":["10.1145\/3605573.3605650","10.1145\/3605573"],"URL":"https:\/\/doi.org\/10.1145\/3605573.3605650","relation":{},"subject":[],"published":{"date-parts":[[2023,8,7]]},"assertion":[{"value":"2023-09-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}