{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:52:41Z","timestamp":1773193961264,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":48,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CNS-2147909,CNS-2211882,CNS-2239351"],"award-info":[{"award-number":["CNS-2147909,CNS-2211882,CNS-2239351"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707220","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"557-571","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["<scp>GraphPipe:<\/scp>\n            Improving Performance and Scalability of DNN Training with Graph Pipeline Parallelism"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6321-2989","authenticated-orcid":false,"given":"Byungsoo","family":"Jeon","sequence":"first","affiliation":[{"name":"NVIDIA, Arlington, VA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-6672-1252","authenticated-orcid":false,"given":"Mengdi","family":"Wu","sequence":"additional","affiliation":[{"name":"Carnegie Mellon Univerisity, Pittsburgh, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-6834-375X","authenticated-orcid":false,"given":"Shiyi","family":"Cao","sequence":"additional","affiliation":[{"name":"UC Berkeley, Berkeley, CA, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8200-3106","authenticated-orcid":false,"given":"Sunghyun","family":"Kim","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4793-9069","authenticated-orcid":false,"given":"Sunghyun","family":"Park","sequence":"additional","affiliation":[{"name":"NVIDIA, Seattle, WA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-5128-5017","authenticated-orcid":false,"given":"Neeraj","family":"Aggarwal","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-2608-6522","authenticated-orcid":false,"given":"Colin","family":"Unger","sequence":"additional","affiliation":[{"name":"Stanford University, Palo Alto, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-5626-4551","authenticated-orcid":false,"given":"Daiyaan","family":"Arfeen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4387-7940","authenticated-orcid":false,"given":"Peiyuan","family":"Liao","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9371-8358","authenticated-orcid":false,"given":"Xupeng","family":"Miao","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0014-6742","authenticated-orcid":false,"given":"Mohammad","family":"Alizadeh","sequence":"additional","affiliation":[{"name":"Massachusetts Institute of Technology, Cambridge, MA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3065-7316","authenticated-orcid":false,"given":"Gregory R.","family":"Ganger","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, 
USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5744-3940","authenticated-orcid":false,"given":"Tianqi","family":"Chen","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1270-5185","authenticated-orcid":false,"given":"Zhihao","family":"Jia","sequence":"additional","affiliation":[{"name":"Carnegie Mellon University, Pittsburgh, PA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"https:\/\/github.com\/ECP-CANDLE\/Benchmarks\/tree\/master\/Pilot1\/Uno. Accessed: 2023-05-15."},{"key":"e_1_3_2_1_2_1","unstructured":"Ai and compute. https:\/\/openai.com\/research\/ai-and-compute. Accessed: 2023-05-15."},{"key":"e_1_3_2_1_3_1","unstructured":"Gpt-4o. https:\/\/openai.com\/index\/hello-gpt-4o\/. Accessed: 2024-10-09."},{"key":"e_1_3_2_1_4_1","unstructured":"Summit supercomputer. https:\/\/www.olcf.ornl.gov\/summit\/. Accessed: 2023-09-06."},{"key":"e_1_3_2_1_5_1","first-page":"265","volume-title":"12th {USENIX} symposium on operating systems design and implementation ({OSDI} 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et al. Tensorflow: A system for large-scale machine learning. In 12th {USENIX} symposium on operating systems design and implementation ({OSDI} 16), pages 265--283, 2016."},{"key":"e_1_3_2_1_6_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. 
Advances in neural information processing systems, 33:1877--1901, 2020."},{"key":"e_1_3_2_1_7_1","volume-title":"Large scale distributed deep networks. Advances in neural information processing systems, 25","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean, Greg Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Mark Mao, Marc'aurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, et al. Large scale distributed deep networks. Advances in neural information processing systems, 25, 2012."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441593"},{"key":"e_1_3_2_1_9_1","volume-title":"large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677","author":"Goyal Priya","year":"2017","unstructured":"Priya Goyal, Piotr Doll\u00e1r, Ross Girshick, Pieter Noordhuis, Lukasz Wesolowski, Aapo Kyrola, Andrew Tulloch, Yangqing Jia, and Kaiming He. Accurate, large minibatch sgd: Training imagenet in 1 hour. arXiv preprint arXiv:1706.02677, 2017."},{"key":"e_1_3_2_1_10_1","unstructured":"Will Douglas Heaven. Gpt-4 is bigger and better than chatgpt-but openai won't say why. https:\/\/www.technologyreview.com\/2023\/03\/14\/1069823. Accessed: 2023-05-15."},{"key":"e_1_3_2_1_11_1","volume-title":"et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, Yonghui Wu, et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_12_1","first-page":"45","volume-title":"International Conference for High Performance Computing, Networking, Storage and Analysis (SC)","author":"Jain Arpan","unstructured":"Arpan Jain, Ammar Ahmad Awan, Asmaa M. 
Aljuhani, Jahanzeb Maqbool Hashmi, Quentin G. Anthony, Hari Subramoni, Dhabaleswar K. Panda, Raghu Machiraju, and Anil Parwani. GEMS: gpu-enabled memory-aware model-parallelism system for distributed DNN training. In International Conference for High Performance Computing, Networking, Storage and Analysis (SC), page 45. IEEE\/ACM, 2020."},{"key":"e_1_3_2_1_13_1","first-page":"4904","volume-title":"International conference on machine learning","author":"Jia Chao","year":"2021","unstructured":"Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc Le, Yun-Hsuan Sung, Zhen Li, and Tom Duerig. Scaling up visual and vision-language representation learning with noisy text supervision. In International conference on machine learning, pages 4904--4916. PMLR, 2021."},{"key":"e_1_3_2_1_14_1","first-page":"1","article-title":"Beyond data and model parallelism for deep neural networks","volume":"1","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. Beyond data and model parallelism for deep neural networks. Proceedings of Machine Learning and Systems, 1:1--13, 2019.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_15_1","volume-title":"One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997","author":"Krizhevsky Alex","year":"2014","unstructured":"Alex Krizhevsky. One weird trick for parallelizing convolutional neural networks. arXiv preprint arXiv:1404.5997, 2014."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1001\/jama.2016.12537"},{"key":"e_1_3_2_1_17_1","volume-title":"Breadth-first pipeline parallelism. arXiv preprint arXiv:2211.05953","author":"Lamy-Poirier Joel","year":"2022","unstructured":"Joel Lamy-Poirier. Breadth-first pipeline parallelism. arXiv preprint arXiv:2211.05953, 2022."},{"key":"e_1_3_2_1_18_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. 
arXiv preprint arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv preprint arXiv:2006.16668, 2020."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"e_1_3_2_1_21_1","volume-title":"Hanayo: Harnessing wave-like pipeline parallelism for enhanced large model training efficiency. CoRR, abs\/2308.15762","author":"Liu Ziming","year":"2023","unstructured":"Ziming Liu, Shenggan Cheng, Haotian Zhou, and Yang You. Hanayo: Harnessing wave-like pipeline parallelism for enhanced large model training efficiency. CoRR, abs\/2308.15762, 2023."},{"key":"e_1_3_2_1_22_1","first-page":"2430","volume-title":"International Conference on Machine Learning","author":"Mirhoseini Azalia","year":"2017","unstructured":"Azalia Mirhoseini, Hieu Pham, Quoc V Le, Benoit Steiner, Rasmus Larsen, Yuefeng Zhou, Naveen Kumar, Mohammad Norouzi, Samy Bengio, and Jeff Dean. Device placement optimization with reinforcement learning. In International Conference on Machine Learning, pages 2430--2439. PMLR, 2017."},{"key":"e_1_3_2_1_23_1","volume-title":"High-performance, distributed training of large-scale deep learning recommendation models. arXiv preprint arXiv:2104.05158","author":"Mudigere Dheevatsa","year":"2021","unstructured":"Dheevatsa Mudigere, Yuchen Hao, Jianyu Huang, Andrew Tulloch, Srinivas Sridharan, Xing Liu, Mustafa Ozdal, Jade Nie, Jongsoo Park, Liang Luo, et al. High-performance, distributed training of large-scale deep learning recommendation models. 
arXiv preprint arXiv:2104.05158, 2021."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_25_1","first-page":"7937","volume-title":"International Conference on Machine Learning","author":"Narayanan Deepak","year":"2021","unstructured":"Deepak Narayanan, Amar Phanishayee, Kaiyu Shi, Xie Chen, and Matei Zaharia. Memory-efficient pipeline-parallel dnn training. In International Conference on Machine Learning, pages 7937--7947. PMLR, 2021."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_27_1","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G Azzolini et al. Deep learning recommendation model for personalization and recommendation systems. arXiv preprint arXiv:1906.00091 2019."},{"key":"e_1_3_2_1_28_1","volume-title":"Gpt-4 technical report","author":"AI.","year":"2023","unstructured":"OpenAI. Gpt-4 technical report, 2023."},{"key":"e_1_3_2_1_29_1","volume-title":"et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32:8026--8037","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32:8026--8037, 2019."},{"key":"e_1_3_2_1_30_1","volume-title":"Carbon emissions and large neural network training","author":"Patterson David","year":"2021","unstructured":"David Patterson, Joseph Gonzalez, Quoc Le, Chen Liang, Lluis-Miquel Munguia, Daniel Rothchild, David So, Maud Texier, and Jeff Dean. 
Carbon emissions and large neural network training, 2021."},{"key":"e_1_3_2_1_31_1","unstructured":"Alec Radford Jong Wook Kim Chris Hallacy Aditya Ramesh Gabriel Goh Sandhini Agarwal Girish Sastry Amanda Askell Pamela Mishkin Jack Clark Gretchen Krueger and Ilya Sutskever. Learning transferable visual models from natural language supervision 2021."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_33_1","first-page":"8821","volume-title":"International Conference on Machine Learning","author":"Ramesh Aditya","year":"2021","unstructured":"Aditya Ramesh, Mikhail Pavlov, Gabriel Goh, Scott Gray, Chelsea Voss, Alec Radford, Mark Chen, and Ilya Sutskever. Zero-shot text-to-image generation. In International Conference on Machine Learning, pages 8821--8831. PMLR, 2021."},{"key":"e_1_3_2_1_34_1","volume-title":"Alexander Novikov, Gabriel Barth-Maron, Mai Gimenez, Yury Sulsky, Jackie Kay, Jost Tobias Springenberg, et al. A generalist agent. arXiv preprint arXiv:2205.06175","author":"Reed Scott","year":"2022","unstructured":"Scott Reed, Konrad Zolna, Emilio Parisotto, Sergio Gomez Colmenarejo, Alexander Novikov, Gabriel Barth-Maron, Mai Gimenez, Yury Sulsky, Jackie Kay, Jost Tobias Springenberg, et al. A generalist agent. arXiv preprint arXiv:2205.06175, 2022."},{"key":"e_1_3_2_1_35_1","volume-title":"et al. Mesh-tensorflow: Deep learning for supercomputers. Advances in neural information processing systems, 31","author":"Shazeer Noam","year":"2018","unstructured":"Noam Shazeer, Youlong Cheng, Niki Parmar, Dustin Tran, Ashish Vaswani, Penporn Koanantakool, Peter Hawkins, HyoukJoong Lee, Mingsheng Hong, Cliff Young, et al. Mesh-tensorflow: Deep learning for supercomputers. Advances in neural information processing systems, 31, 2018."},{"key":"e_1_3_2_1_36_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. 
arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.4018\/978-1-7998-3591-2"},{"key":"e_1_3_2_1_38_1","volume-title":"Linear-time computability of combinatorial problems on series-parallel graphs. Journal of the ACM (JACM), 29(3):623--641","author":"Takamizawa Kazuhiko","year":"1982","unstructured":"Kazuhiko Takamizawa, Takao Nishizeki, and Nobuji Saito. Linear-time computability of combinatorial problems on series-parallel graphs. Journal of the ACM (JACM), 29(3):623--641, 1982."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-020-05173-2"},{"key":"e_1_3_2_1_40_1","first-page":"24829","article-title":"Multidimensional planner for dnn parallelization","volume":"34","author":"Tarnawski Jakub M","year":"2021","unstructured":"Jakub M Tarnawski, Deepak Narayanan, and Amar Phanishayee. Piper: Multidimensional planner for dnn parallelization. Advances in Neural Information Processing Systems, 34:24829--24840, 2021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","volume-title":"Chameleon: Mixed-modal early-fusion foundation models. arXiv preprint arXiv:2405.09818","author":"Team Chameleon","year":"2024","unstructured":"Chameleon Team. Chameleon: Mixed-modal early-fusion foundation models. 
arXiv preprint arXiv:2405.09818, 2024."},{"key":"e_1_3_2_1_42_1","first-page":"267","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, et al. Unity: Accelerating dnn training through joint optimization of algebraic transformations and parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 267--284, 2022."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/79173.79181"},{"key":"e_1_3_2_1_44_1","volume-title":"Attention is all you need. Advances in neural information processing systems, 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11633-022-1410-8"},{"key":"e_1_3_2_1_47_1","volume-title":"International Conference on Learning Representations (ICLR)","author":"Yang Pengcheng","year":"2022","unstructured":"Pengcheng Yang, Xiaoming Zhang, Wenpeng Zhang, Ming Yang, and HongWei. Group-based interleaved pipeline parallelism for large-scale DNN training. In International Conference on Learning Representations (ICLR), 2022."},{"key":"e_1_3_2_1_48_1","first-page":"559","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. 
Alpa: Automating inter-and intra-operator parallelism for distributed deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 559--578, 2022."}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","location":"Rotterdam Netherlands","acronym":"ASPLOS '25","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707220","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707220","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:46:14Z","timestamp":1755787574000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707220"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":48,"alternative-id":["10.1145\/3669940.3707220","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707220","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}