{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T10:03:19Z","timestamp":1775815399698,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":66,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,29]],"date-time":"2024-05-29T00:00:00Z","timestamp":1716940800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,29]]},"DOI":"10.1145\/3636534.3649363","type":"proceedings-article","created":{"date-parts":[[2024,5,29]],"date-time":"2024-05-29T13:32:55Z","timestamp":1716989575000},"page":"312-326","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":24,"title":["Asteroid: Resource-Efficient Hybrid Pipeline Parallelism for Collaborative DNN Training on Heterogeneous Edge Devices"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-8867-0655","authenticated-orcid":false,"given":"Shengyuan","family":"Ye","sequence":"first","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4800-8768","authenticated-orcid":false,"given":"Liekang","family":"Zeng","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9745-4372","authenticated-orcid":false,"given":"Xiaowen","family":"Chu","sequence":"additional","affiliation":[{"name":"Data Science and Analytics Thrust, HKUST(GZ), Guangzhou, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1772-7751","authenticated-orcid":false,"given":"Guoliang","family":"Xing","sequence":"additional","affiliation":[{"name":"The Chinese University of Hong Kong, Hong Kong SAR, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9943-6020","authenticated-orcid":false,"given":"Xu","family":"Chen","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University, Guangzhou, China"}]}],"member":"320","published-online":{"date-parts":[[2024,5,29]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2017. Jetson-TX2. https:\/\/developer.nvidia.com\/embedded\/jetson-tx2."},{"key":"e_1_3_2_1_2_1","unstructured":"2019. Jetson-Nano. https:\/\/developer.nvidia.com\/embedded\/jetson-nano-developer-kit."},{"key":"e_1_3_2_1_3_1","unstructured":"2019. Jetson-NX. https:\/\/developer.nvidia.com\/blog\/jetson-xavier-nx-the-worlds-smallest-ai-supercomputer."},{"key":"e_1_3_2_1_4_1","unstructured":"2019. PyTorch. https:\/\/github.com\/pytorch\/pytorch."},{"key":"e_1_3_2_1_5_1","unstructured":"2019. PyTorch DDP. https:\/\/pytorch.org\/docs\/stable\/_modules\/torch\/nn\/parallel\/distributed.html."},{"key":"e_1_3_2_1_6_1","unstructured":"2021. On-device training with tensorflow lite. https:\/\/www.tensorflow.org\/lite\/examples\/on_device_training\/overview."},{"key":"e_1_3_2_1_7_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Bhardwaj Romil","year":"2022","unstructured":"Romil Bhardwaj, Zhengxu Xia, Ganesh Ananthanarayanan, Junchen Jiang, Yuanchao Shu, Nikolaos Karianakis, Kevin Hsieh, Paramvir Bahl, and Ion Stoica. 2022. Ekya: Continuous learning of video analytics models on edge compute servers. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). 119--135."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2994551.2994564"},{"key":"e_1_3_2_1_9_1","volume-title":"Felix Xiaozhu Lin, and Mengwei Xu","author":"Cai Dongqi","year":"2022","unstructured":"Dongqi Cai, Yaozong Wu, Shangguang Wang, Felix Xiaozhu Lin, and Mengwei Xu. 2022. Autofednlp: An efficient fednlp framework. arXiv preprint arXiv:2205.10162 (2022)."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/MNET.2018.1700146"},{"key":"e_1_3_2_1_11_1","volume-title":"Knowledge distillation for mobile edge computation offloading. arXiv preprint arXiv:2004.04366","author":"Chen Haowei","year":"2020","unstructured":"Haowei Chen, Liekang Zeng, Shuai Yu, and Xu Chen. 2020. Knowledge distillation for mobile edge computation offloading. arXiv preprint arXiv:2004.04366 (2020)."},{"key":"e_1_3_2_1_12_1","volume-title":"Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174","author":"Chen Tianqi","year":"2016","unstructured":"Tianqi Chen, Bing Xu, Chiyuan Zhang, and Carlos Guestrin. 2016. Training deep nets with sublinear memory cost. arXiv preprint arXiv:1604.06174 (2016)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3451211","article-title":"Quantization of Deep Neural Networks for Accurate Edge Computing","volume":"17","author":"Chen Wentao","year":"2021","unstructured":"Wentao Chen, Hailong Qiu, Jian Zhuang, Chutong Zhang, Yu Hu, Qing Lu, Tianchen Wang, Yiyu Shi, Meiping Huang, and Xiaowe Xu. 2021. Quantization of Deep Neural Networks for Accurate Edge Computing. ACM Journal on Emerging Technologies in Computing Systems (JETC) 17, 4 (2021), 1--11.","journal-title":"ACM Journal on Emerging Technologies in Computing Systems (JETC)"},{"key":"e_1_3_2_1_14_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_15_1","unstructured":"Krizhevsky et. al. 2009. CIFAR-10. https:\/\/www.cs.toronto.edu\/~kriz\/cifar.html."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3539765"},{"key":"e_1_3_2_1_18_1","volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. 2017. Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677 (2017)."},{"key":"e_1_3_2_1_19_1","volume-title":"EDDL: A Distributed Deep Learning System for Resource-limited Edge Computing Environment. In 2021 IEEE\/ACM Symposium on Edge Computing (SEC). IEEE, 1--13","author":"Hao Pengzhan","year":"2021","unstructured":"Pengzhan Hao and Yifan Zhang. 2021. EDDL: A Distributed Deep Learning System for Resource-limited Edge Computing Environment. In 2021 IEEE\/ACM Symposium on Edge Computing (SEC). IEEE, 1--13."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3495243.3560551"},{"key":"e_1_3_2_1_22_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. 2019. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_2_1_23_1","first-page":"497","article-title":"Checkmate: Breaking the memory wall with optimal tensor rematerialization","volume":"2","author":"Jain Paras","year":"2020","unstructured":"Paras Jain, Ajay Jain, Aniruddha Nrusimha, Amir Gholami, Pieter Abbeel, Joseph Gonzalez, Kurt Keutzer, and Ion Stoica. 2020. Checkmate: Breaking the memory wall with optimal tensor rematerialization. Proceedings of Machine Learning and Systems 2 (2020), 497--511.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538932"},{"key":"e_1_3_2_1_25_1","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Jia Xianyan","year":"2022","unstructured":"Xianyan Jia, Le Jiang, Ang Wang, Wencong Xiao, Ziji Shi, Jie Zhang, Xinyuan Li, Langshi Chen, Yong Li, Zhen Zheng, et al. 2022. Whale: Efficient Giant Model Training over Heterogeneous {GPUs}. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 673--688."},{"key":"e_1_3_2_1_26_1","first-page":"1","article-title":"Mnn: A universal and efficient inference engine","volume":"2","author":"Jiang Xiaotang","year":"2020","unstructured":"Xiaotang Jiang, Huan Wang, Yiliu Chen, Ziqi Wu, Lichuan Wang, Bin Zou, Yafeng Yang, Zongyang Cui, Yu Cai, Tianhang Yu, et al. 2020. Mnn: A universal and efficient inference engine. Proceedings of Machine Learning and Systems 2 (2020), 1--13.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_27_1","volume-title":"Wei-Han Lee, Kin K Leung, and Leandros Tassiulas.","author":"Jiang Yuang","year":"2022","unstructured":"Yuang Jiang, Shiqiang Wang, Victor Valls, Bong Jun Ko, Wei-Han Lee, Kin K Leung, and Leandros Tassiulas. 2022. Model pruning enables efficient federated learning on edge devices. IEEE Transactions on Neural Networks and Learning Systems (2022)."},{"key":"e_1_3_2_1_28_1","volume-title":"Wei-Han Lee, Kin K Leung, and Leandros Tassiulas.","author":"Jiang Yuang","year":"2022","unstructured":"Yuang Jiang, Shiqiang Wang, Victor Valls, Bong Jun Ko, Wei-Han Lee, Kin K Leung, and Leandros Tassiulas. 2022. Model pruning enables efficient federated learning on edge devices. TNNLS (2022)."},{"key":"e_1_3_2_1_29_1","volume-title":"torchgpipe: On-the-fly pipeline parallelism for training giant models. arXiv preprint arXiv:2004.09910","author":"Kim Chiheon","year":"2020","unstructured":"Chiheon Kim, Heungsub Lee, Myungryong Jeong, Woonhyuk Baek, Boogeon Yoon, Ildoo Kim, Sungbin Lim, and Sungwoong Kim. 2020. torchgpipe: On-the-fly pipeline parallelism for training giant models. arXiv preprint arXiv:2004.09910 (2020)."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303950"},{"key":"e_1_3_2_1_31_1","volume-title":"Communication efficient distributed machine learning with the parameter server. Advances in Neural Information Processing Systems 27","author":"Li Mu","year":"2014","unstructured":"Mu Li, David G Andersen, Alexander J Smola, and Kai Yu. 2014. Communication efficient distributed machine learning with the parameter server. Advances in Neural Information Processing Systems 27 (2014)."},{"key":"e_1_3_2_1_32_1","volume-title":"On-device training under 256kb memory. arXiv preprint arXiv:2206.15472","author":"Lin Ji","year":"2022","unstructured":"Ji Lin, Ligeng Zhu, Wei-Ming Chen, Wei-Chen Wang, Chuang Gan, and Song Han. 2022. On-device training under 256kb memory. arXiv preprint arXiv:2206.15472 (2022)."},{"key":"e_1_3_2_1_33_1","volume-title":"Annual Conference on Neural Information Processing Systems (NeurIPS).","author":"Lin Ji","year":"2022","unstructured":"Ji Lin, Ligeng Zhu, Wei-Ming Chen, Wei-Chen Wang, Chuang Gan, and Song Han. 2022. On-Device Training Under 256KB Memory. In Annual Conference on Neural Information Processing Systems (NeurIPS)."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3560905.3568520"},{"key":"e_1_3_2_1_35_1","volume-title":"Efficient Pipeline Planning for Expedited Distributed DNN Training. arXiv preprint arXiv:2204.10562","author":"Luo Ziyue","year":"2022","unstructured":"Ziyue Luo, Xiaodong Yi, Guoping Long, Shiqing Fan, Chuan Wu, Jun Yang, and Wei Lin. 2022. Efficient Pipeline Planning for Expedited Distributed DNN Training. arXiv preprint arXiv:2204.10562 (2022)."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2017.7927211"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3039714"},{"key":"e_1_3_2_1_38_1","unstructured":"Brendan McMahan Eider Moore Daniel Ramage Seth Hampson and Blaise Aguera y Arcas. 2017. Communication-efficient learning of deep networks from decentralized data. In Artificial intelligence and statistics. PMLR 1273--1282."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458864.3467681"},{"key":"e_1_3_2_1_42_1","volume-title":"2020 USENIX Annual Technical Conference (USENIX ATC 20)","author":"Park Jay H","year":"2020","unstructured":"Jay H Park, Gyeongchan Yun, M Yi Chang, Nguyen T Nguyen, Seungmin Lee, Jaesik Choi, Sam H Noh, and Young-ri Choi. 2020. {HetPipe}: Enabling large {DNN} training on (whimpy) heterogeneous {GPU} clusters through integration of pipelined model parallelism and data parallelism. In 2020 USENIX Annual Technical Conference (USENIX ATC 20). 307--321."},{"key":"e_1_3_2_1_43_1","volume-title":"POET: Training Neural Networks on Tiny Devices with Integrated Rematerialization and Paging. In International Conference on Machine Learning. PMLR, 17573--17583","author":"Patil Shishir G","year":"2022","unstructured":"Shishir G Patil, Paras Jain, Prabal Dutta, Ion Stoica, and Joseph Gonzalez. 2022. POET: Training Neural Networks on Tiny Devices with Integrated Rematerialization and Paging. In International Conference on Machine Learning. PMLR, 17573--17583."},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/MPRV.2009.82"},{"key":"e_1_3_2_1_47_1","volume-title":"Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799","author":"Sergeev Alexander","year":"2018","unstructured":"Alexander Sergeev and Mike Del Balso. 2018. Horovod: fast and easy distributed deep learning in TensorFlow. arXiv preprint arXiv:1802.05799 (2018)."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPSN54338.2022.00029"},{"key":"e_1_3_2_1_49_1","volume-title":"International conference on machine learning. PMLR, 6105--6114","author":"Tan Mingxing","year":"2019","unstructured":"Mingxing Tan and Quoc Le. 2019. Efficientnet: Rethinking model scaling for convolutional neural networks. In International conference on machine learning. PMLR, 6105--6114."},{"key":"e_1_3_2_1_50_1","unstructured":"DeepSpeed Team and Rangan Majumder. 2020. DeepSpeed: Extreme-scale model training for everyone."},{"key":"e_1_3_2_1_51_1","volume-title":"Multi-DNN Accelerators for Next-Generation AI Systems. arXiv preprint arXiv:2205.09376","author":"Venieris Stylianos I","year":"2022","unstructured":"Stylianos I Venieris, Christos-Savvas Bouganis, and Nicholas D Lane. 2022. Multi-DNN Accelerators for Next-Generation AI Systems. arXiv preprint arXiv:2205.09376 (2022)."},{"key":"e_1_3_2_1_52_1","unstructured":"Oriol Vinyals Charles Blundell Timothy Lillicrap Daan Wierstra et al. 2016. Matching networks for one shot learning. Advances in neural information processing systems 29 (2016)."},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3498361.3538928"},{"key":"e_1_3_2_1_54_1","volume-title":"Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 1--6.","author":"Wei Yuanxin","year":"2024","unstructured":"Yuanxin Wei, Shengyuan Ye, Jiazhi Jiang, Xu Chen, Dan Huang, Jiangsu Du, and Yutong Lu. 2024. Communication-Efficient Model Parallelism for Distributed In-situ Transformer Inference. In 2024 Design, Automation & Test in Europe Conference & Exhibition (DATE). IEEE, 1--6."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2023.3238049"},{"key":"e_1_3_2_1_56_1","volume-title":"Asynchronous federated optimization. arXiv preprint arXiv:1903.03934","author":"Xie Cong","year":"2019","unstructured":"Cong Xie, Sanmi Koyejo, and Indranil Gupta. 2019. Asynchronous federated optimization. arXiv preprint arXiv:1903.03934 (2019)."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3495243.3560545"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3287075"},{"key":"e_1_3_2_1_59_1","first-page":"r2","article-title":"Elfish: Resource-aware federated learning on heterogeneous edge devices","volume":"2","author":"Xu Zirui","year":"2019","unstructured":"Zirui Xu, Zhao Yang, Jinjun Xiong, Janlei Yang, and Xiang Chen. 2019. Elfish: Resource-aware federated learning on heterogeneous edge devices. Ratio 2, r1 (2019), r2.","journal-title":"Ratio"},{"key":"e_1_3_2_1_60_1","volume-title":"Galaxy: A Resource-Efficient Collaborative Edge AI System for In-situ Transformer Inference. In IEEE INFOCOM 2024-IEEE Conference on Computer Communications.","author":"Ye Shengyuan","year":"2024","unstructured":"Shengyuan Ye, Jiangsu Du, Liekang Zeng, Wenzhong Ou, Xiaowen Chu, Yutong Lu, and Xu Chen. 2024. Galaxy: A Resource-Efficient Collaborative Edge AI System for In-situ Transformer Inference. In IEEE INFOCOM 2024-IEEE Conference on Computer Communications."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3545008.3545015"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1109\/TNET.2020.3042320"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3511982"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2018.2858384"},{"key":"e_1_3_2_1_65_1","volume-title":"Alpa: Automating Inter-and IntraOperator Parallelism for Distributed Deep Learning. arXiv preprint arXiv:2201.12023","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Joseph E Gonzalez, et al. 2022. Alpa: Automating Inter-and IntraOperator Parallelism for Distributed Deep Learning. arXiv preprint arXiv:2201.12023 (2022)."},{"key":"e_1_3_2_1_66_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2019.2918951"}],"event":{"name":"ACM MobiCom '24: 30th Annual International Conference on Mobile Computing and Networking","location":"Washington D.C. DC USA","acronym":"ACM MobiCom '24","sponsor":["SIGMOBILE ACM Special Interest Group on Mobility of Systems, Users, Data and Computing"]},"container-title":["Proceedings of the 30th Annual International Conference on Mobile Computing and Networking"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3636534.3649363","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3636534.3649363","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T22:54:12Z","timestamp":1750287252000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3636534.3649363"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,29]]},"references-count":66,"alternative-id":["10.1145\/3636534.3649363","10.1145\/3636534"],"URL":"https:\/\/doi.org\/10.1145\/3636534.3649363","relation":{},"subject":[],"published":{"date-parts":[[2024,5,29]]},"assertion":[{"value":"2024-05-29","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}