{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T15:10:24Z","timestamp":1755789024223,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":61,"publisher":"ACM","license":[{"start":{"date-parts":[[2025,3,30]],"date-time":"2025-03-30T00:00:00Z","timestamp":1743292800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/501100006374","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["CCF-2216964"],"award-info":[{"award-number":["CCF-2216964"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,3,30]]},"DOI":"10.1145\/3669940.3707216","type":"proceedings-article","created":{"date-parts":[[2025,2,6]],"date-time":"2025-02-06T12:28:01Z","timestamp":1738844881000},"page":"182-197","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Composing Distributed Computations Through Task and Kernel Fusion"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-0746-066X","authenticated-orcid":false,"given":"Rohan","family":"Yadav","sequence":"first","affiliation":[{"name":"Stanford University, Stanford, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5228-1295","authenticated-orcid":false,"given":"Shiv","family":"Sundram","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8364-5784","authenticated-orcid":false,"given":"Wonchan","family":"Lee","sequence":"additional","affiliation":[{"name":"NVIDIA, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6093-7602","authenticated-orcid":false,"given":"Michael","family":"Garland","sequence":"additional","affiliation":[{"name":"NVIDIA, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8928-3032","authenticated-orcid":false,"given":"Michael","family":"Bauer","sequence":"additional","affiliation":[{"name":"NVIDIA, Santa Clara, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3723-9555","authenticated-orcid":false,"given":"Alex","family":"Aiken","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, CA, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2267-903X","authenticated-orcid":false,"given":"Fredrik","family":"Kjolstad","sequence":"additional","affiliation":[{"name":"Stanford University, Stanford, CA, USA"}]}],"member":"320","published-online":{"date-parts":[[2025,3,30]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Martin","year":"2016","unstructured":"Martin Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek G. Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. 2016. TensorFlow: A system for large-scale machine learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). 265--283. https:\/\/www.usenix.org\/system\/files\/conference\/osdi16\/osdi16-abadi.pdf"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/277650.277721"},{"key":"e_1_3_2_1_3_1","volume-title":"Ullman","author":"Aho Alfred V.","year":"2006","unstructured":"Alfred V. Aho, Monica S. Lam, Ravi Sethi, and Jeffrey D. Ullman. 2006. Compilers: Principles, Techniques, and Tools (2nd Edition). Addison-Wesley Longman Publishing Co., Inc., USA."},{"key":"e_1_3_2_1_4_1","unstructured":"Frances E Allen and John Cocke. 1971. A Catalogue of Optimizing Transformations. (1971)."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/155090.155102"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640366"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-03869-3_80"},{"key":"e_1_3_2_1_8_1","volume-title":"Lois Curfman McInnes","author":"Balay Satish","year":"2022","unstructured":"Satish Balay, Shrirang Abhyankar, Mark F. Adams, Steven Benson, Jed Brown, Peter Brune, Kris Buschelman, Emil M. Constantinescu, Lisandro Dalcin, Alp Dener, Victor Eijkhout, Jacob Faibussowitsch, William D. Gropp, V\u00e1clav Hapla, Tobin Isaac, Pierre Jolivet, Dmitry Karpeev, Dinesh Kaushik, Matthew G. Knepley, Fande Kong, Scott Kruger, Dave A. May, Lois Curfman McInnes, Richard Tran Mills, Lawrence Mitchell, Todd Munson, Jose E. Roman, Karl Rupp, Patrick Sanan, Jason Sarich, Barry F. Smith, Stefano Zampini, Hong Zhang, Hong Zhang, and Junchao Zhang. 2022. PETSc Web page. https:\/\/petsc.org\/. https:\/\/petsc.org\/"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1168917.1168906"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.21105\/jose.00021"},{"key":"e_1_3_2_1_11_1","first-page":"430","article-title":"Pathways: Asynchronous distributed dataflow for ML","volume":"4","author":"Barham Paul","year":"2022","unstructured":"Paul Barham, Aakanksha Chowdhery, Jeff Dean, Sanjay Ghemawat, Steven Hand, Daniel Hurt, Michael Isard, Hyeontaek Lim, Ruoming Pang, Sudip Roy, et al. 2022. Pathways: Asynchronous distributed dataflow for ML. Proceedings of Machine Learning and Systems, Vol. 4 (2022), 430--449.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356175"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2021.3088239"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3572848.3577515"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2012.71"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356173"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3544559"},{"key":"e_1_3_2_1_18_1","volume-title":"High Performance Code Generation in MLIR: An Early Case Study with GEMM. CoRR","author":"Bondhugula Uday","year":"2020","unstructured":"Uday Bondhugula. 2020. High Performance Code Generation in MLIR: An Early Case Study with GEMM. CoRR, Vol. abs\/2003.00532 (2020). showeprint[arXiv]2003.00532 https:\/\/arxiv.org\/abs\/2003.00532"},{"key":"e_1_3_2_1_19_1","unstructured":"Uday Kumar Reddy Bondhugula. 2008a. Effective automatic parallelization and locality optimization using the polyhedral model. Ph. D. Dissertation. USA. Advisor(s) Sadayappan P. AAI3325799."},{"key":"e_1_3_2_1_20_1","unstructured":"Uday Kumar Reddy Bondhugula. 2008b. Effective Automatic Parallelization and Locality Optimization Using the Polyhedral Model. Ph. D. Dissertation. USA. Advisor(s) Sadayappan P. AAI3325799."},{"key":"e_1_3_2_1_21_1","volume-title":"Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang.","author":"Bradbury James","year":"2018","unstructured":"James Bradbury, Roy Frostig, Peter Hawkins, Matthew James Johnson, Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang. 2018. JAX: composable transformations of PythonNumPy programs. http:\/\/github.com\/google\/jax"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/2854038.2854042"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/113445.113457"},{"key":"e_1_3_2_1_24_1","volume-title":"TVM: End-to-End Optimization Stack for Deep Learning. CoRR","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Haichen Shen, Eddie Q. Yan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: End-to-End Optimization Stack for Deep Learning. CoRR, Vol. abs\/1802.04799 (2018). showeprint[arXiv]1802.04799 http:\/\/arxiv.org\/abs\/1802.04799"},{"key":"e_1_3_2_1_25_1","unstructured":"Pi-Yueh Chuang. 2021. TorchSWE: GPU shallow-water equation solver."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/CLUSTER.2015.50"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.1999.807510"},{"key":"e_1_3_2_1_28_1","unstructured":"Dask Authors. 2023. Dask Optimization. Accessed: 2023--10-08."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/2508075.2514878"},{"key":"e_1_3_2_1_30_1","unstructured":"Huseyin M. Elibol. 2022. NumS: Scalable Array Programming for the Cloud. Ph. D. Dissertation. https:\/\/www.proquest.com\/dissertations-theses\/nums-scalable-array-programming-cloud\/docview\/2727269933\/se-2 Copyright - Database copyright ProQuest LLC; ProQuest does not claim copyright in the individual underlying works; Last updated - 2023-03-08."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/165180.165214"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1007\/978--3--662-05372-012"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020--2649--2"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"volume-title":"Proceedings of the 6th International Workshop on Languages and Compilers for Parallel Computing. Springer-Verlag","author":"Kennedy Ken","key":"e_1_3_2_1_35_1","unstructured":"Ken Kennedy and Kathryn S. McKinley. 1993. Maximizing Loop Parallelism and Improving Data Locality via Loop Fusion and Distribution. In Proceedings of the 6th International Workshop on Languages and Compilers for Parallel Computing. Springer-Verlag, Berlin, Heidelberg, 301--320."},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3133901"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.5555\/977395.977673"},{"key":"e_1_3_2_1_38_1","volume-title":"MLIR: A Compiler Infrastructure for the End of Moore's Law. CoRR","author":"Lattner Chris","year":"2020","unstructured":"Chris Lattner, Jacques A. Pienaar, Mehdi Amini, Uday Bondhugula, River Riddle, Albert Cohen, Tatiana Shpeisman, Andy Davis, Nicolas Vasilache, and Oleksandr Zinenko. 2020. MLIR: A Compiler Infrastructure for the End of Moore's Law. CoRR, Vol. abs\/2002.11054 (2020). showeprint[arXiv]2002.11054 https:\/\/arxiv.org\/abs\/2002.11054"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356199"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO53902.2022.9741270"},{"key":"e_1_3_2_1_41_1","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Moritz Philipp","year":"2018","unstructured":"Philipp Moritz, Robert Nishihara, Stephanie Wang, Alexey Tumanov, Richard Liaw, Eric Liang, Melih Elibol, Zongheng Yang, William Paul, Michael I Jordan, et al. 2018. Ray: A distributed framework for emerging {AI} applications. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 561--577."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/2509136.2509518"},{"key":"e_1_3_2_1_44_1","volume-title":"Weld: A common runtime for high performance data analytics.","author":"Palkar Shoumik","year":"2017","unstructured":"Shoumik Palkar, James J Thomas, Anil Shanbhag, Deepak Narayanan, Holger Pirk, Malte Schwarzkopf, Saman Amarasinghe, and Matei Zaharia. 2017. Weld: A common runtime for high performance data analytics. (2017)."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359652"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3338497"},{"key":"e_1_3_2_1_48_1","volume-title":"XLA : Compiling Machine Learning for Peak Performance.","author":"Sabne Amit","year":"2020","unstructured":"Amit Sabne. 2020. XLA : Compiling Machine Learning for Peak Performance."},{"key":"e_1_3_2_1_49_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Slaughter Elliott","year":"2020","unstructured":"Elliott Slaughter, Wei Wu, Yuankun Fu, Legend Brandenburg, Nicolai Garcia, Wilhem Kautz, Emily Marx, Kaleb S. Morris, Qinglei Cao, George Bosilca, Seema Mirchandaney, Wonchan Lee, Sean Treichler, Patrick McCormick, and Alex Aiken. 2020. Task bench: a parameterized benchmark for evaluating parallel runtime performance. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC '20). IEEE Press, Article 62, 15 pages."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476175"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/PAW-ATM56565.2022.00007"},{"key":"e_1_3_2_1_52_1","volume-title":"Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, Xi Luo, Dheevatsa Mudigere, Jongsoo Park, Misha Smelyanskiy, and Alex Aiken. 2022. Unity: Accelerating DNN Training Through Joint Optimization of Algebraic Transformations and Parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 267--284. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/unger"},{"key":"e_1_3_2_1_53_1","volume-title":"Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. CoRR","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache, Oleksandr Zinenko, Theodoros Theodoridis, Priya Goyal, Zachary DeVito, William S. Moses, Sven Verdoolaege, Andrew Adams, and Albert Cohen. 2018. Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. CoRR, Vol. abs\/1802.04730 (2018). showeprint[arXiv]1802.04730 http:\/\/arxiv.org\/abs\/1802.04730"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1016\/0304--3975(90)90147-A"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3632880"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503221.3508434"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/71.97902"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523437"},{"key":"e_1_3_2_1_59_1","volume-title":"Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis","author":"Yadav Rohan","year":"2022","unstructured":"Rohan Yadav, Alex Aiken, and Fredrik Kjolstad. 2022b. SpDISTAL: Compiling Distributed Sparse Tensor Computations. In Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis (Dallas, Texas) (SC '22). IEEE Press, Article 59, 15 pages."},{"key":"e_1_3_2_1_60_1","volume-title":"Manolis Papadakis, Michael Garland, Alex Aiken, Fredrik Kjolstad, and Michael Bauer.","author":"Yadav Rohan","year":"2023","unstructured":"Rohan Yadav, Wonchan Lee, Melih Elibol, Taylor Lee Patti, Manolis Papadakis, Michael Garland, Alex Aiken, Fredrik Kjolstad, and Michael Bauer. 2023. Legate Sparse: Distributed Sparse Computing in Python. (2023)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/1854273.1854298"}],"event":{"name":"ASPLOS '25: 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems","sponsor":["SIGPLAN ACM Special Interest Group on Programming Languages","SIGOPS ACM Special Interest Group on Operating Systems","SIGARCH ACM Special Interest Group on Computer Architecture"],"location":"Rotterdam Netherlands","acronym":"ASPLOS '25"},"container-title":["Proceedings of the 30th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707216","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3669940.3707216","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,21]],"date-time":"2025-08-21T14:52:48Z","timestamp":1755787968000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3669940.3707216"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,3,30]]},"references-count":61,"alternative-id":["10.1145\/3669940.3707216","10.1145\/3669940"],"URL":"https:\/\/doi.org\/10.1145\/3669940.3707216","relation":{},"subject":[],"published":{"date-parts":[[2025,3,30]]},"assertion":[{"value":"2025-03-30","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}