{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T16:52:17Z","timestamp":1771951937166,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":50,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,2,22]],"date-time":"2022-02-22T00:00:00Z","timestamp":1645488000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61802358"],"award-info":[{"award-number":["61802358"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"HK RGC GRF","award":["17202318"],"award-info":[{"award-number":["17202318"]}]},{"name":"HK RGC GRF","award":["17207117"],"award-info":[{"award-number":["17207117"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,2,28]]},"DOI":"10.1145\/3503222.3507735","type":"proceedings-article","created":{"date-parts":[[2022,2,22]],"date-time":"2022-02-22T20:49:01Z","timestamp":1645562941000},"page":"374-387","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":11,"title":["NASPipe: high performance and reproducible pipeline parallel supernet training via causal synchronous parallelism"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1643-2583","authenticated-orcid":false,"given":"Shixiong","family":"Zhao","sequence":"first","affiliation":[{"name":"University of Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fanxin","family":"Li","sequence":"additional","affiliation":[{"name":"University of Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xusheng","family":"Chen","sequence":"additional","affiliation":[{"name":"University of Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tianxiang","family":"Shen","sequence":"additional","affiliation":[{"name":"University of Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Li","family":"Chen","sequence":"additional","affiliation":[{"name":"Huawei Technologies, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Sen","family":"Wang","sequence":"additional","affiliation":[{"name":"Huawei Technologies, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Nicholas","family":"Zhang","sequence":"additional","affiliation":[{"name":"Huawei Technologies, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cheng","family":"Li","sequence":"additional","affiliation":[{"name":"University of Science and Technology of China, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Heming","family":"Cui","sequence":"additional","affiliation":[{"name":"University of Hong Kong, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,2,22]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. NVIDIA\/framework-determinism. https:\/\/github.com\/NVIDIA\/framework-determinism"},{"key":"e_1_3_2_1_2_1","volume-title":"Tensorflow: A system for large-scale machine learning. In 12th $USENIX$ symposium on operating systems design and implementation ($OSDI$ 16). 265\u2013283.","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, and Michael Isard. 2016. Tensorflow: A system for large-scale machine learning. In 12th $USENIX$ symposium on operating systems design and implementation ($OSDI$ 16). 265\u2013283."},{"key":"e_1_3_2_1_3_1","volume-title":"14th $USENIX$ Symposium on Operating Systems Design and Implementation ($OSDI$ 20). 499\u2013514.","author":"Bai Zhihao","unstructured":"Zhihao Bai, Zhen Zhang, Yibo Zhu, and Xin Jin. 2020. PipeSwitch: Fast Pipelined Context Switching for Deep Learning Applications. In 14th $USENIX$ Symposium on Operating Systems Design and Implementation ($OSDI$ 20). 499\u2013514."},{"key":"e_1_3_2_1_4_1","volume-title":"International Conference on Machine Learning. 550\u2013559","author":"Bender Gabriel","year":"2018","unstructured":"Gabriel Bender, Pieter-Jan Kindermans, Barret Zoph, Vijay Vasudevan, and Quoc Le. 2018. Understanding and simplifying one-shot architecture search. In International Conference on Machine Learning. 550\u2013559."},{"key":"e_1_3_2_1_5_1","volume-title":"Proxylessnas: Direct neural architecture search on target task and hardware. arXiv preprint arXiv:1812.00332.","author":"Cai Han","year":"2018","unstructured":"Han Cai, Ligeng Zhu, and Song Han. 2018. Proxylessnas: Direct neural architecture search on target task and hardware. arXiv preprint arXiv:1812.00332."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/CMPCON.1990.63682"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"e_1_3_2_1_8_1","volume-title":"Autogluon-tabular: Robust and accurate automl for structured data. arXiv preprint arXiv:2003.06505.","author":"Erickson Nick","year":"2020","unstructured":"Nick Erickson, Jonas Mueller, Alexander Shirkov, Hang Zhang, Pedro Larroy, Mu Li, and Alexander Smola. 2020. Autogluon-tabular: Robust and accurate automl for structured data. arXiv preprint arXiv:2003.06505."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58517-4_32"},{"key":"e_1_3_2_1_10_1","volume-title":"Sangeetha Abdu Jyothi, and Roy H Campbell","author":"Hashemi Sayed Hadi","year":"2018","unstructured":"Sayed Hadi Hashemi, Sangeetha Abdu Jyothi, and Roy H Campbell. 2018. Tictac: Accelerating distributed deep learning with communication scheduling. arXiv preprint arXiv:1803.03288."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378530"},{"key":"e_1_3_2_1_12_1","volume-title":"Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, and Yonghui Wu.","author":"Huang Yanping","year":"2018","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, and Yonghui Wu. 2018. Gpipe: Efficient training of giant neural networks using pipeline parallelism. arXiv preprint arXiv:1811.06965."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/3292500.3330648"},{"key":"e_1_3_2_1_14_1","volume-title":"Nimble: Lightweight and Parallel GPU Task Scheduling for Deep Learning. arXiv preprint arXiv:2012.02732.","author":"Kwon Woosuk","year":"2020","unstructured":"Woosuk Kwon, Gyeong-In Yu, Eunji Jeong, and Byung-Gon Chun. 2020. Nimble: Lightweight and Parallel GPU Task Scheduling for Deep Learning. arXiv preprint arXiv:2012.02732."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00850"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_1_18_1","volume-title":"Darts: Differentiable architecture search. arXiv preprint arXiv:1806.09055.","author":"Liu Hanxiao","year":"2018","unstructured":"Hanxiao Liu, Karen Simonyan, and Yiming Yang. 2018. Darts: Differentiable architecture search. arXiv preprint arXiv:1806.09055."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.29"},{"key":"e_1_3_2_1_20_1","volume-title":"Themis: Fair and Efficient $GPU$ Cluster Scheduling. In 17th $USENIX$ Symposium on Networked Systems Design and Implementation ($NSDI$ 20). 289\u2013304.","author":"Mahajan Kshiteej","year":"2020","unstructured":"Kshiteej Mahajan, Arjun Balasubramanian, Arjun Singhvi, Shivaram Venkataraman, Aditya Akella, Amar Phanishayee, and Shuchi Chawla. 2020. Themis: Fair and Efficient $GPU$ Cluster Scheduling. In 17th $USENIX$ Symposium on Networked Systems Design and Implementation ($NSDI$ 20). 289\u2013304."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/966049.781528"},{"key":"e_1_3_2_1_24_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, and Luca Antiga. 2019. Pytorch: An imperative style, high-performance deep learning library. arXiv preprint arXiv:1912.01703."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378505"},{"key":"e_1_3_2_1_26_1","volume-title":"International Conference on Machine Learning. 4095\u20134104","author":"Pham Hieu","year":"2018","unstructured":"Hieu Pham, Melody Guan, Barret Zoph, Quoc Le, and Jeff Dean. 2018. Efficient neural architecture search via parameters sharing. In International Conference on Machine Learning. 4095\u20134104."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33014780"},{"key":"e_1_3_2_1_29_1","volume-title":"International Conference on Machine Learning. 2902\u20132911","author":"Real Esteban","year":"2017","unstructured":"Esteban Real, Sherry Moore, Andrew Selle, Saurabh Saxena, Yutaka Leon Suematsu, Jie Tan, Quoc V Le, and Alexey Kurakin. 2017. Large-scale evolution of image classifiers. In International Conference on Machine Learning. 2902\u20132911."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/1791194.1791203"},{"key":"e_1_3_2_1_33_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053.","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053."},{"key":"e_1_3_2_1_34_1","volume-title":"International Conference on Machine Learning. 5877\u20135886","author":"So David","year":"2019","unstructured":"David So, Quoc Le, and Chen Liang. 2019. The evolved transformer. In International Conference on Machine Learning. 5877\u20135886."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-46147-8_29"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00293"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01079"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3200691.3178491"},{"key":"e_1_3_2_1_39_1","unstructured":"Qiang Wang Shaohuai Shi Canhui Wang and Xiaowen Chu. 2020. Communication Contention Aware Scheduling of Multiple Deep Learning Training Jobs. arXiv preprint arXiv:2002.10105."},{"key":"e_1_3_2_1_40_1","volume-title":"Gandiva: Introspective cluster scheduling for deep learning. In 13th $USENIX$ Symposium on Operating Systems Design and Implementation ($OSDI$ 18). 595\u2013610.","author":"Xiao Wencong","year":"2018","unstructured":"Wencong Xiao, Romil Bhardwaj, Ramachandran Ramjee, Muthian Sivathanu, Nipun Kwatra, Zhenhua Han, Pratyush Patel, Xuan Peng, Hanyu Zhao, and Quanlu Zhang. 2018. Gandiva: Introspective cluster scheduling for deep learning. In 13th $USENIX$ Symposium on Operating Systems Design and Implementation ($OSDI$ 18). 595\u2013610."},{"key":"e_1_3_2_1_41_1","first-page":"3","volume-title":"Proceedings of Machine Learning and Systems","author":"Yang Bowen","year":"2021","unstructured":"Bowen Yang, Jian Zhang, Jonathan Li, Christopher R\u00e9, Christopher Aberger, and Christopher De Sa. 2021. Pipemare: Asynchronous pipeline parallel dnn training. Proceedings of Machine Learning and Systems, 3 (2021)."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00207"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","unstructured":"Zhao You Shulin Feng Dan Su and Dong Yu. 2021. SpeechMoE: Scaling to Large Acoustic Models with Dynamic Routing Mixture of Experts. arXiv preprint arXiv:2105.03036 https:\/\/doi.org\/10.21437\/Interspeech.2021-478","DOI":"10.21437\/Interspeech.2021-478"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020"},{"key":"e_1_3_2_1_45_1","volume-title":"Retiarii: A Deep Learning Exploratory-Training Framework. In 14th $USENIX$ Symposium on Operating Systems Design and Implementation ($OSDI$ 20). 919\u2013936.","author":"Zhang Quanlu","year":"2020","unstructured":"Quanlu Zhang, Zhenhua Han, Fan Yang, Yuge Zhang, Zhe Liu, Mao Yang, and Lidong Zhou. 2020. Retiarii: A Deep Learning Exploratory-Training Framework. In 14th $USENIX$ Symposium on Operating Systems Design and Implementation ($OSDI$ 20). 919\u2013936."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN48063.2020.00036"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/DSN.2018.00033"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3094364"},{"key":"e_1_3_2_1_49_1","volume-title":"International Conference on Machine Learning. 12707\u201312718","author":"Zhao Yiyang","year":"2021","unstructured":"Yiyang Zhao, Linnan Wang, Yuandong Tian, Rodrigo Fonseca, and Tian Guo. 2021. Few-shot neural architecture search. In International Conference on Machine Learning. 12707\u201312718."},{"key":"e_1_3_2_1_50_1","first-page":"4","article-title":"Parallelized stochastic gradient descent","volume":"4","author":"Zinkevich Martin","year":"2010","unstructured":"Martin Zinkevich, Markus Weimer, Alexander J Smola, and Lihong Li. 2010. Parallelized stochastic gradient descent.. In NIPS. 4, 4.","journal-title":"NIPS."},{"key":"e_1_3_2_1_51_1","unstructured":"Barret Zoph and Quoc V Le. 2016. Neural architecture search with reinforcement learning. arXiv preprint arXiv:1611.01578."}],"event":{"name":"ASPLOS '22: 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Lausanne Switzerland","acronym":"ASPLOS '22","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503222.3507735","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3503222.3507735","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:11:39Z","timestamp":1750191099000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3503222.3507735"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,2,22]]},"references-count":50,"alternative-id":["10.1145\/3503222.3507735","10.1145\/3503222"],"URL":"https:\/\/doi.org\/10.1145\/3503222.3507735","relation":{},"subject":[],"published":{"date-parts":[[2022,2,22]]},"assertion":[{"value":"2022-02-22","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}