{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,10]],"date-time":"2026-03-10T04:35:24Z","timestamp":1773117324921,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":63,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,11,4]],"date-time":"2024-11-04T00:00:00Z","timestamp":1730678400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-sa\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["2144796"],"award-info":[{"award-number":["2144796"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,11,4]]},"DOI":"10.1145\/3694715.3695955","type":"proceedings-article","created":{"date-parts":[[2024,11,15]],"date-time":"2024-11-15T19:28:18Z","timestamp":1731698898000},"page":"505-521","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Scaling Deep Learning Computation over the Inter-Core Connected Intelligence Processor with T10"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0006-8171-4970","authenticated-orcid":false,"given":"Yiqi","family":"Liu","sequence":"first","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, Illinois, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-0363-9486","authenticated-orcid":false,"given":"Yuqi","family":"Xue","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, Illinois, United States"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9436-674X","authenticated-orcid":false,"given":"Yu","family":"Cheng","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, 
China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-9524-5476","authenticated-orcid":false,"given":"Lingxiao","family":"Ma","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7466-2128","authenticated-orcid":false,"given":"Ziming","family":"Miao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4495-1997","authenticated-orcid":false,"given":"Jilong","family":"Xue","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1125-671X","authenticated-orcid":false,"given":"Jian","family":"Huang","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign, Urbana, Illinois, United States"}]}],"member":"320","published-online":{"date-parts":[[2024,11,15]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"PartIR: Composing SPMD Partitioning Strategies for Machine Learning. arXiv preprint arXiv:2401.11202","author":"Alabed Sami","year":"2024","unstructured":"Sami Alabed, Daniel Belov, Bart Chrzaszcz, Juliana Franco, Dominik Grewe, Dougal Maclaurin, James Molloy, Tom Natan, Tamara Norman, Xiaoyue Pan, Adam Paszke, Norman A. Rink, Michael Schaarschmidt, Timur Sitdikov, Agnieszka Swietlik, Dimitrios Vytiniotis, and Joel Wee. 2024. PartIR: Composing SPMD Partitioning Strategies for Machine Learning. arXiv preprint arXiv:2401.11202 (2024)."},{"key":"e_1_3_2_1_2_1","volume-title":"Proc. ACM Program. Lang.","author":"Bansal Manya","year":"2023","unstructured":"Manya Bansal, Olivia Hsu, Kunle Olukotun, and Fredrik Kjolstad. 2023. Mosaic: An Interoperable Compiler for Tensor Algebra. Proc. ACM Program. Lang. 
(2023)."},{"key":"e_1_3_2_1_3_1","volume-title":"Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang.","author":"Bradbury James","year":"2018","unstructured":"James Bradbury, Roy Frostig, Peter Hawkins, Matthew James Johnson, Chris Leary, Dougal Maclaurin, George Necula, Adam Paszke, Jake VanderPlas, Skye Wanderman-Milne, and Qiao Zhang. 2018. JAX: composable transformations of Python+NumPy programs. http:\/\/github.com\/google\/jax."},{"key":"e_1_3_2_1_4_1","unstructured":"Cerebras. 2021. Cerebras Systems Raises $250M in Funding for Over $4B Valuation to Advance the Future of Artificial Intelligence Compute. https:\/\/www.cerebras.net\/press-release\/cerebras-systems-raises-250m-in-funding-for-over-4b-valuation-to-advance-the-future-of-artificial-intelligence-compute\/."},{"key":"e_1_3_2_1_5_1","volume-title":"Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI '18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI '18)."},{"key":"e_1_3_2_1_6_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2019. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv preprint arXiv:1810.04805 (2019)."},{"key":"e_1_3_2_1_7_1","volume-title":"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. 
arXiv preprint arXiv:2010.11929","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. 2021. An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale. arXiv preprint arXiv:2010.11929 (2021)."},{"key":"e_1_3_2_1_8_1","unstructured":"FriendliAI. 2024. FriendliAI. https:\/\/friendli.ai\/."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1168857.1168877"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/605397.605428"},{"key":"e_1_3_2_1_11_1","unstructured":"Graphcore. 2022. Bow Pod256. https:\/\/www.graphcore.ai\/products\/bow-pod256."},{"key":"e_1_3_2_1_12_1","unstructured":"Graphcore. 2022. Graphcore raises $222 million in series E funding round. https:\/\/www.graphcore.ai\/posts\/graphcore-raises-222-million-in-series-e-funding-round."},{"key":"e_1_3_2_1_13_1","unstructured":"Graphcore. 2022. IPU Hardware Overview. https:\/\/docs.graphcore.ai\/projects\/ipu-programmers-guide\/en\/latest\/about_ipu.html."},{"key":"e_1_3_2_1_14_1","unstructured":"Graphcore. 2022. PopART User Guide. https:\/\/docs.graphcore.ai\/projects\/popart-user-guide\/en\/latest\/intro.html."},{"key":"e_1_3_2_1_15_1","unstructured":"Graphcore. 2022. Tile Vertex ISA. https:\/\/docs.graphcore.ai\/projects\/isa\/en\/latest\/_static\/Tile-Vertex-ISA_1.2.3.pdf."},{"key":"e_1_3_2_1_16_1","unstructured":"Graphcore. 2023. V-IPU User Guide. https:\/\/docs.graphcore.ai\/projects\/vipu-user\/en\/latest\/introduction.html."},{"key":"e_1_3_2_1_17_1","volume-title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces. arXiv preprint arXiv:2312.00752","author":"Gu Albert","year":"2023","unstructured":"Albert Gu and Tri Dao. 2023. Mamba: Linear-Time Sequence Modeling with Selective State Spaces. 
arXiv preprint arXiv:2312.00752 (2023)."},{"key":"e_1_3_2_1_18_1","volume-title":"Deep Residual Learning for Image Recognition. arXiv preprint arXiv:1512.03385","author":"He Kaiming","year":"2015","unstructured":"Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun. 2015. Deep Residual Learning for Image Recognition. arXiv preprint arXiv:1512.03385 (2015)."},{"key":"e_1_3_2_1_19_1","volume-title":"Dissecting the Graphcore IPU Architecture via Microbenchmarking. arXiv preprint arXiv:1912.03413","author":"Jia Zhe","year":"2019","unstructured":"Zhe Jia, Blake Tillman, Marco Maggioni, and Daniele Paolo Scarpazza. 2019. Dissecting the Graphcore IPU Architecture via Microbenchmarking. arXiv preprint arXiv:1912.03413 (2019)."},{"key":"e_1_3_2_1_20_1","volume-title":"Beyond Data and Model Parallelism for Deep Neural Networks. arXiv preprint arXiv:1807.05358","author":"Jia Zhihao","year":"2018","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. 2018. Beyond Data and Model Parallelism for Deep Neural Networks. arXiv preprint arXiv:1807.05358 (2018)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"e_1_3_2_1_22_1","volume-title":"George Kurian, Sheng Li, Nishant Patil, James Laudon, Cliff Young, and David Patterson.","author":"Jouppi Norman P.","year":"2020","unstructured":"Norman P. Jouppi, Doe Hyun Yoon, George Kurian, Sheng Li, Nishant Patil, James Laudon, Cliff Young, and David Patterson. 2020. A Domain-Specific Supercomputer for Training Deep Neural Networks. Commun. ACM (2020)."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_24_1","volume-title":"2021 IEEE Hot Chips 33 Symposium (HCS '21)","author":"Knowles Simon","year":"2021","unstructured":"Simon Knowles. 2021. Graphcore Colossus Mk2 IPU. 
In 2021 IEEE Hot Chips 33 Symposium (HCS '21)."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3192366.3192379"},{"key":"e_1_3_2_1_26_1","volume-title":"MAESTRO: A Data-Centric Approach to Understand Reuse, Performance, and Hardware Cost of DNN Mappings","author":"Kwon Hyoukjun","year":"2020","unstructured":"Hyoukjun Kwon, Prasanth Chatarasi, Vivek Sarkar, Tushar Krishna, Michael Pellauer, and Angshuman Parashar. 2020. MAESTRO: A Data-Centric Approach to Understand Reuse, Performance, and Hardware Cost of DNN Mappings. IEEE Micro (2020)."},{"key":"e_1_3_2_1_27_1","volume-title":"Multi-Wafer AI Cluster. In 2021 IEEE Hot Chips 33 Symposium (HCS '21)","author":"Lie Sean","year":"2021","unstructured":"Sean Lie. 2021. Multi-Million Core, Multi-Wafer AI Cluster. In 2021 IEEE Hot Chips 33 Symposium (HCS '21)."},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 48th Annual International Symposium on Computer Architecture (ISCA '21)","author":"Lu Liqiang","year":"2021","unstructured":"Liqiang Lu, Naiqing Guan, Yuyue Wang, Liancheng Jia, Zizhang Luo, Jieming Yin, Jason Cong, and Yun Liang. 2021. TENET: A Framework for Modeling Tensor Dataflow Based on Relation-Centric Notation. In Proceedings of the 48th Annual International Symposium on Computer Architecture (ISCA '21)."},{"key":"e_1_3_2_1_29_1","volume-title":"Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI '20)","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. Rammer: Enabling Holistic Deep Learning Compiler Optimizations with rTasks. In Proceedings of the 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI '20)."},{"key":"e_1_3_2_1_30_1","volume-title":"NeRF: representing scenes as neural radiance fields for view synthesis. Commun. 
ACM","author":"Mildenhall Ben","year":"2021","unstructured":"Ben Mildenhall, Pratul P. Srinivasan, Matthew Tancik, Jonathan T. Barron, Ravi Ramamoorthi, and Ren Ng. 2021. NeRF: representing scenes as neural radiance fields for view synthesis. Commun. ACM (2021)."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_32_1","unstructured":"Nvidia. 2020. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/resources.nvidia.com\/en-us-genomics-ep\/ampere-architecture-white-paper."},{"key":"e_1_3_2_1_33_1","unstructured":"NVIDIA. 2022. NVIDIA Hopper Architecture In-Depth. https:\/\/developer.nvidia.com\/blog\/nvidia-hopper-architecture-in-depth\/."},{"key":"e_1_3_2_1_34_1","unstructured":"NVIDIA. 2023. NVIDIA TensorRT. https:\/\/developer.nvidia.com\/tensorrt."},{"key":"e_1_3_2_1_35_1","unstructured":"ONNX. 2023. ONNX. https:\/\/onnx.ai\/."},{"key":"e_1_3_2_1_36_1","volume-title":"Introducing Triton: Open-source GPU programming for neural networks. https:\/\/openai.com\/index\/triton\/.","author":"AI.","year":"2021","unstructured":"OpenAI. 2021. Introducing Triton: Open-source GPU programming for neural networks. https:\/\/openai.com\/index\/triton\/."},{"key":"e_1_3_2_1_37_1","unstructured":"Dylan Patel and Daniel Nishball. 2023. Groq Inference Tokenomics: Speed But At What Cost. https:\/\/www.semianalysis.com\/p\/groq-inference-tokenomics-speed-but."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Bo Peng Eric Alcaide Quentin Anthony Alon Albalak Samuel Arcadinho Huanqi Cao Xin Cheng Michael Chung Matteo Grella Kranthi Kiran GV Xuzheng He Haowen Hou Przemyslaw Kazienko Jan Kocon Jiaming Kong Bartlomiej Koptyra Hayden Lau Krishna Sri Ipsit Mantri Ferdinand Mom Atsushi Saito Xiangru Tang Bolun Wang Johan S. Wind Stanislaw Wozniak Ruichong Zhang Zhenyuan Zhang Qihang Zhao Peng Zhou Jian Zhu and Rui-Jie Zhu. 2023. RWKV: Reinventing RNNs for the Transformer Era. 
arXiv preprint arXiv:2305.13048 (2023).","DOI":"10.18653\/v1\/2023.findings-emnlp.936"},{"key":"e_1_3_2_1_39_1","volume-title":"2021 IEEE Hot Chips 33 Symposium (HCS '21)","author":"Prabhakar Raghu","year":"2021","unstructured":"Raghu Prabhakar and Sumti Jairath. 2021. SambaNova SN10 RDU: Accelerating Software 2.0 with Dataflow. In 2021 IEEE Hot Chips 33 Symposium (HCS '21)."},{"key":"e_1_3_2_1_40_1","volume-title":"SambaNova SN40L: Scaling the AI Memory Wall with Dataflow and Composition of Experts. arXiv preprint arXiv:2405.07518","author":"Prabhakar Raghu","year":"2024","unstructured":"Raghu Prabhakar, Ram Sivaramakrishnan, Darshan Gandhi, Yun Du, Mingran Wang, Xiangyu Song, Kejie Zhang, Tianren Gao, Angela Wang, Karen Li, Yongning Sheng, Joshua Brot, Denis Sokolov, Apurv Vivek, Calvin Leung, Arjun Sabnis, Jiayu Bai, Tuowen Zhao, Mark Gottscho, David Jackson, Mark Luttrell, Manish K. Shah, Edison Chen, Kaizhao Liang, Swayambhoo Jain, Urmish Thakker, Dawei Huang, Sumti Jairath, Kevin J. Brown, and Kunle Olukotun. 2024. SambaNova SN40L: Scaling the AI Memory Wall with Dataflow and Composition of Experts. arXiv preprint arXiv:2405.07518 (2024)."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080256"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2491956.2462176"},{"key":"e_1_3_2_1_43_1","volume-title":"Revet: A Language and Compiler for Dataflow Threads. arXiv preprint arXiv:2302.06124","author":"Rucker Alexander","year":"2024","unstructured":"Alexander Rucker, Shiv Sundram, Coleman Smith, Matthew Vilim, Raghu Prabhakar, Fredrik Kjolstad, and Kunle Olukotun. 2024. Revet: A Language and Compiler for Dataflow Threads. arXiv preprint arXiv:2302.06124 (2024)."},{"key":"e_1_3_2_1_44_1","unstructured":"SambaNova. 2021. SambaNova Systems Becomes World's Best-Funded AI Startup. 
https:\/\/sambanova.ai\/press\/series-d."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI '23)","author":"Shi Yining","year":"2023","unstructured":"Yining Shi, Zhi Yang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Ziming Miao, Yuxiao Guo, Fan Yang, and Lidong Zhou. 2023. Welder: Scheduling Deep Learning Memory Access via Tilegraph. In Proceedings of the 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI '23)."},{"key":"e_1_3_2_1_46_1","volume-title":"Retentive Network: A Successor to Transformer for Large Language Models. arXiv preprint arXiv:2307.08621","author":"Sun Yutao","year":"2023","unstructured":"Yutao Sun, Li Dong, Shaohan Huang, Shuming Ma, Yuqing Xia, Jilong Xue, Jianyong Wang, and Furu Wei. 2023. Retentive Network: A Successor to Transformer for Large Language Models. arXiv preprint arXiv:2307.08621 (2023)."},{"key":"e_1_3_2_1_47_1","volume":"200","author":"Taylor M.B.","unstructured":"M.B. Taylor, J. Psota, A. Saraf, N. Shnidman, V. Strumpen, M. Frank, S. Amarasinghe, A. Agarwal, W. Lee, J. Miller, D. Wentzlaff, I. Bratt, B. Greenwald, H. Hoffmann, P. Johnson, and J. Kim. 2004. Evaluation of the Raw microprocessor: an exposed-wire-delay architecture for ILP and streams. In Proceedings of the 31st Annual International Symposium on Computer Architecture (ISCA '04).","journal-title":"J. Kim."},{"key":"e_1_3_2_1_48_1","unstructured":"TensorFlow. 2023. XLA. https:\/\/www.tensorflow.org\/xla."},{"key":"e_1_3_2_1_49_1","volume-title":"Meet Grayskull. https:\/\/tenstorrent.com\/grayskull\/.","year":"2023","unstructured":"Tenstorrent. 2023. Meet Grayskull. 
https:\/\/tenstorrent.com\/grayskull\/."},{"key":"e_1_3_2_1_50_1","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale Dan Bikel Lukas Blecher Cristian Canton Ferrer Moya Chen Guillem Cucurull David Esiobu Jude Fernandes Jeremy Fu Wenyin Fu Brian Fuller Cynthia Gao Vedanuj Goswami Naman Goyal Anthony Hartshorn Saghar Hosseini Rui Hou Hakan Inan Marcin Kardas Viktor Kerkez Madian Khabsa Isabel Kloumann Artem Korenev Punit Singh Koura Marie-Anne Lachaux Thibaut Lavril Jenya Lee Diana Liskovich Yinghai Lu Yuning Mao Xavier Martinet Todor Mihaylov Pushkar Mishra Igor Molybog Yixin Nie Andrew Poulton Jeremy Reizenstein Rashi Rungta Kalyan Saladi Alan Schelten Ruan Silva Eric Michael Smith Ranjan Subramanian Xiaoqing Ellen Tan Binh Tang Ross Taylor Adina Williams Jian Xiang Kuan Puxin Xu Zheng Yan Iliyan Zarov Yuchen Zhang Angela Fan Melanie Kambadur Sharan Narang Aurelien Rodriguez Robert Stojnic Sergey Edunov and Thomas Scialom. 2023. Llama 2: Open Foundation and Fine-Tuned Chat Models. arXiv preprint arXiv:2307.09288 (2023)."},{"key":"e_1_3_2_1_51_1","volume-title":"Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. arXiv preprint arXiv:1802.04730","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache, Oleksandr Zinenko, Theodoros Theodoridis, Priya Goyal, Zachary DeVito, William S. Moses, Sven Verdoolaege, Andrew Adams, and Albert Cohen. 2018. Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. arXiv preprint arXiv:1802.04730 (2018)."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3302424.3303953"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/1498765.1498785"},{"key":"e_1_3_2_1_54_1","volume-title":"GSPMD: General and Scalable Parallelization for ML Computation Graphs. 
arXiv preprint arXiv:2105.04663","author":"Xu Yuanzhong","year":"2021","unstructured":"Yuanzhong Xu, HyoukJoong Lee, Dehao Chen, Blake Hechtman, Yanping Huang, Rahul Joshi, Maxim Krikun, Dmitry Lepikhin, Andy Ly, Marcello Maggioni, Ruoming Pang, Noam Shazeer, Shibo Wang, Tao Wang, Yonghui Wu, and Zhifeng Chen. 2021. GSPMD: General and Scalable Parallelization for ML Computation Graphs. arXiv preprint arXiv:2105.04663 (2021)."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1145\/3519939.3523437"},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI '22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI '22)."},{"key":"e_1_3_2_1_57_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer.","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arXiv preprint arXiv:2205.01068 (2022)."},{"key":"e_1_3_2_1_58_1","volume-title":"Proceedings of the 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI '23)","author":"Zhao Jie","year":"2023","unstructured":"Jie Zhao, Siyuan Feng, Xiaoqiang Dan, Fei Liu, Chengke Wang, Sheng Yuan, Wenyuan Lv, and Qikai Xie. 2023. Effectively Scheduling Computational Graphs of Deep Neural Networks toward Their Domain-Specific Accelerators. 
In Proceedings of the 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI '23)."},{"key":"e_1_3_2_1_59_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI '20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. 2020. Ansor: generating high-performance tensor programs for deep learning. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation (OSDI '20)."},{"key":"e_1_3_2_1_60_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI '22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. 2022. Alpa: Automating Inter-and Intra-Operator Parallelism for Distributed Deep Learning. In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI '22)."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527440"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_2_1_63_1","volume-title":"Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI '22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. 2022. ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. 
In Proceedings of the 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI '22)."}],"event":{"name":"SOSP '24: ACM SIGOPS 30th Symposium on Operating Systems Principles","location":"Austin TX USA","acronym":"SOSP '24","sponsor":["SIGOPS ACM Special Interest Group on Operating Systems","USENIX"]},"container-title":["Proceedings of the ACM SIGOPS 30th Symposium on Operating Systems Principles"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694715.3695955","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3694715.3695955","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:05:47Z","timestamp":1750291547000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694715.3695955"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,4]]},"references-count":63,"alternative-id":["10.1145\/3694715.3695955","10.1145\/3694715"],"URL":"https:\/\/doi.org\/10.1145\/3694715.3695955","relation":{},"subject":[],"published":{"date-parts":[[2024,11,4]]},"assertion":[{"value":"2024-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}