{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T12:27:47Z","timestamp":1773318467910,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":70,"publisher":"ACM","funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2017, 62222210"],"award-info":[{"award-number":["U21B2017, 62222210"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Shenzhen-HongKong Joint Funding Project","award":["SGDX20230116092056010"],"award-info":[{"award-number":["SGDX20230116092056010"]}]},{"name":"Shenzhen Key Laboratory of Intelligent Bioinformatics","award":["ZDSYS20220422103800001"],"award-info":[{"award-number":["ZDSYS20220422103800001"]}]},{"name":"Minister of Education, Singapore","award":["MOE-T2EP20124-0017, MOET32020-0004"],"award-info":[{"award-number":["MOE-T2EP20124-0017, MOET32020-0004"]}]},{"DOI":"10.13039\/501100001381","name":"National Research Foundation Singapore","doi-asserted-by":"publisher","award":["NCRP25-P04-TAICeN, A-8002767-00-00"],"award-info":[{"award-number":["NCRP25-P04-TAICeN, A-8002767-00-00"]}],"id":[{"id":"10.13039\/501100001381","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001445","name":"DSO National Laboratories - Singapore","doi-asserted-by":"publisher","award":["AISG2-GC-2023-008-1B"],"award-info":[{"award-number":["AISG2-GC-2023-008-1B"]}],"id":[{"id":"10.13039\/501100001445","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,11,16]]},"DOI":"10.1145\/3712285.3759779","type":"proceedings-article","created":{"date-parts":[[2025,11,12]],"date-time":"2025-11-12T16:04:47Z","timestamp":1762963487000},"page":"167-184","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["A Sample-Free Compilation Framework for Efficient Dynamic Tensor Computation"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3652-5437","authenticated-orcid":false,"given":"Yangjie","family":"Zhou","sequence":"first","affiliation":[{"name":"Tencent, Shenzhen, China and National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-1703-1691","authenticated-orcid":false,"given":"Honglin","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5210-8166","authenticated-orcid":false,"given":"Qian","family":"Qiu","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6646-5260","authenticated-orcid":false,"given":"Weihao","family":"Cui","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0874-0682","authenticated-orcid":false,"given":"Zihan","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China and Shanghai Qi Zhi Institute, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1244-3151","authenticated-orcid":false,"given":"Peng","family":"Chen","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7165-2095","authenticated-orcid":false,"given":"Mohamed","family":"Wahib","sequence":"additional","affiliation":[{"name":"RIKEN Center for Computational Science, Kobe, Japan"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4479-5525","authenticated-orcid":false,"given":"Cong","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4682-983X","authenticated-orcid":false,"given":"Siyuan","family":"Feng","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6208-4102","authenticated-orcid":false,"given":"Jintao","family":"Meng","sequence":"additional","affiliation":[{"name":"Shenzhen Institute of Advanced Technology, Chinese Academy of Sciences, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5231-4015","authenticated-orcid":false,"given":"Haidong","family":"Lan","sequence":"additional","affiliation":[{"name":"Taichi Graphics, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5660-5493","authenticated-orcid":false,"given":"Jingwen","family":"Leng","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China and Shanghai Qi Zhi Institute, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8255-0118","authenticated-orcid":false,"given":"Yun","family":"Lin","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6512-8326","authenticated-orcid":false,"given":"Jin Song","family":"Dong","sequence":"additional","affiliation":[{"name":"National University of Singapore, Singapore, Singapore"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-4933-4672","authenticated-orcid":false,"given":"Wenxi","family":"Zhu","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0005-8122-3023","authenticated-orcid":false,"given":"Minwen","family":"Deng","sequence":"additional","affiliation":[{"name":"Tencent, Shenzhen, China"}]}],"member":"320","published-online":{"date-parts":[[2025,11,15]]},"reference":[{"key":"e_1_3_3_2_2_2","unstructured":"2020. oneAPI Deep Neural Network Library (oneDNN). https:\/\/github.com\/oneapi-src\/oneDNN."},{"key":"e_1_3_3_2_3_2","first-page":"265","volume-title":"12th USENIX symposium on operating systems design and implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, et\u00a0al. 2016. TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX symposium on operating systems design and implementation (OSDI 16). 265\u2013283."},{"key":"e_1_3_3_2_4_2","unstructured":"ARM. 2017. ARM Compute Library. https:\/\/github.com\/ARM-software\/ComputeLibrary\/"},{"key":"e_1_3_3_2_5_2","unstructured":"Sangmin Bae Jongwoo Ko Hwanjun Song and Se-Young Yun. 2023. Fast and robust early-exiting framework for autoregressive language models with synchronized parallel decoding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2310.05424 (2023)."},{"key":"e_1_3_3_2_6_2","unstructured":"James Bradbury Roy Frostig Peter Hawkins Matthew\u00a0James Johnson Chris Leary Dougal Maclaurin George Necula Adam Paszke Jake VanderPlas Skye Wanderman-Milne et\u00a0al. 2018. JAX: composable transformations of Python+ NumPy programs. Version 0.2 5 (2018) 14\u201324."},{"key":"e_1_3_3_2_7_2","doi-asserted-by":"publisher","DOI":"10.1145\/3295500.3356162"},{"key":"e_1_3_3_2_8_2","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, et\u00a0al. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). 578\u2013594."},{"key":"e_1_3_3_2_9_2","unstructured":"Tianqi Chen Lianmin Zheng Eddie Yan Ziheng Jiang Thierry Moreau Luis Ceze Carlos Guestrin and Arvind Krishnamurthy. 2018. Learning to optimize tensor programs. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_3_2_10_2","unstructured":"Sharan Chetlur Cliff Woolley Philippe Vandermersch Jonathan Cohen John Tran Bryan Catanzaro and Evan Shelhamer. 2014. cudnn: Efficient primitives for deep learning. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1410.0759 (2014)."},{"key":"e_1_3_3_2_11_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00049"},{"key":"e_1_3_3_2_12_2","unstructured":"George Chrysos. 2014. Intel\u00ae xeon phi\u2122 coprocessor-the architecture. Intel Whitepaper 176 2014 (2014) 43\u201350."},{"key":"e_1_3_3_2_13_2","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511615993"},{"key":"e_1_3_3_2_14_2","first-page":"183","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Cui Weihao","year":"2022","unstructured":"Weihao Cui, Han Zhao, Quan Chen, Hao Wei, Zirui Li, Deze Zeng, Chao Li, and Minyi Guo. 2022. DVABatch: Diversity-aware Multi-EntryMulti-Exit Batching for Efficient Processing of DNN Services on GPUs. In 2022 USENIX Annual Technical Conference (USENIX ATC 22). 183\u2013198."},{"key":"e_1_3_3_2_15_2","unstructured":"ONNX\u00a0Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/. Version: x.y.z."},{"key":"e_1_3_3_2_16_2","unstructured":"Jacob Devlin Ming-Wei Chang Kenton Lee and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1810.04805 (2018)."},{"key":"e_1_3_3_2_17_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3576933"},{"key":"e_1_3_3_2_18_2","unstructured":"Matthias Fey and Jan\u00a0Eric Lenssen. 2019. Fast graph representation learning with PyTorch Geometric. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1903.02428 (2019)."},{"key":"e_1_3_3_2_19_2","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.169"},{"key":"e_1_3_3_2_20_2","doi-asserted-by":"crossref","unstructured":"Kazushige Goto and Robert A van\u00a0de Geijn. 2008. Anatomy of high-performance matrix multiplication. ACM Transactions on Mathematical Software (TOMS) 34 3 (2008) 1\u201325.","DOI":"10.1145\/1356052.1356053"},{"key":"e_1_3_3_2_21_2","volume-title":"Sieve methods","author":"Halberstam Heine","year":"2013","unstructured":"Heine Halberstam and Hans\u00a0Egon Richert. 2013. Sieve methods. Courier Corporation."},{"key":"e_1_3_3_2_22_2","doi-asserted-by":"crossref","unstructured":"Yizeng Han Gao Huang Shiji Song Le Yang Honghui Wang and Yulin Wang. 2021. Dynamic neural networks: A survey. IEEE Transactions on Pattern Analysis and Machine Intelligence 44 11 (2021) 7436\u20137456.","DOI":"10.1109\/TPAMI.2021.3117837"},{"key":"e_1_3_3_2_23_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.207"},{"key":"e_1_3_3_2_24_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_3_2_25_2","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_3_2_26_2","unstructured":"Thomas\u00a0N Kipf and Max Welling. 2016. Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1609.02907 (2016)."},{"key":"e_1_3_3_2_27_2","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey\u00a0E Hinton. 2012. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems 25 (2012)."},{"key":"e_1_3_3_2_28_2","doi-asserted-by":"crossref","unstructured":"Yann LeCun Yoshua Bengio and Geoffrey Hinton. 2015. Deep learning. nature 521 7553 (2015) 436\u2013444.","DOI":"10.1038\/nature14539"},{"key":"e_1_3_3_2_29_2","first-page":"881","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. 2020. Rammer: Enabling Holistic Deep Learning Compiler Optimizations with rTasks. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). 881\u2013897."},{"key":"e_1_3_3_2_30_2","doi-asserted-by":"crossref","unstructured":"Pengyu Mu Yi Liu Rui Wang Guoxiang Liu Zhonghao Sun Hailong Yang Zhongzhi Luan and Depei Qian. 2023. Haotuner: A hardware adaptive operator auto-tuner for dynamic shape tensor compilers. IEEE Trans. Comput. 72 11 (2023) 3178\u20133190.","DOI":"10.1109\/TC.2023.3288758"},{"key":"e_1_3_3_2_31_2","unstructured":"S Narang and G Diamos. 2017. DeepBench: Benchmarking deep learning operations on different hardware."},{"key":"e_1_3_3_2_32_2","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_3_2_33_2","unstructured":"NVIDIA. 2021. NVIDIA A100 Tensor Core GPU Architecture. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_3_2_34_2","volume-title":"NVIDIA cuBLAS Documentation","author":"Corporation NVIDIA","year":"2025","unstructured":"NVIDIA Corporation. 2025. NVIDIA cuBLAS Documentation. https:\/\/docs.nvidia.com\/cuda\/cublas\/index.html"},{"key":"e_1_3_3_2_35_2","unstructured":"Adam Paszke Sam Gross Francisco Massa Adam Lerer James Bradbury Gregory Chanan Trevor Killeen Zeming Lin Natalia Gimelshein Luca Antiga et\u00a0al. 2019. Pytorch: An imperative style high-performance deep learning library. Advances in neural information processing systems 32 (2019)."},{"key":"e_1_3_3_2_36_2","doi-asserted-by":"crossref","unstructured":"Andrea Pellegrini Nigel Stephens Magnus Bruce Yasuo Ishii Joseph Pusdesris Abhishek Raja Chris Abernathy Jinson Koppanalil Tushar Ringe Ashok Tummala et\u00a0al. 2020. The arm neoverse n1 platform: Building blocks for the next-gen cloud-to-edge infrastructure soc. IEEE Micro 40 2 (2020) 53\u201362.","DOI":"10.1109\/MM.2020.2972222"},{"key":"e_1_3_3_2_37_2","unstructured":"PyTorch Contributors. 2022. TorchDynamo. https:\/\/pytorch.org\/docs\/master\/dynamo\/."},{"key":"e_1_3_3_2_38_2","unstructured":"Alec Radford Jeffrey Wu Rewon Child David Luan Dario Amodei Ilya Sutskever et\u00a0al. 2019. Language models are unsupervised multitask learners. OpenAI blog 1 8 (2019) 9."},{"key":"e_1_3_3_2_39_2","doi-asserted-by":"crossref","unstructured":"Jonathan Ragan-Kelley Connelly Barnes Andrew Adams Sylvain Paris Fr\u00e9do Durand and Saman Amarasinghe. 2013. Halide: a language and compiler for optimizing parallelism locality and recomputation in image processing pipelines. Acm Sigplan Notices 48 6 (2013) 519\u2013530.","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_3_2_40_2","doi-asserted-by":"crossref","unstructured":"Haseena Rahmath\u00a0P Vishal Srivastava Kuldeep Chaurasia Roberto\u00a0G Pacheco and Rodrigo\u00a0S Couto. 2024. Early-exit deep neural network-a comprehensive survey. Comput. Surveys 57 3 (2024) 1\u201337.","DOI":"10.1145\/3698767"},{"key":"e_1_3_3_2_41_2","unstructured":"Amit Sabne. 2020. XLA : Compiling Machine Learning for Peak Performance."},{"key":"e_1_3_3_2_42_2","unstructured":"Haichen Shen Jared Roesch Zhi Chen Wei Chen Yong Wu Mu Li Vin Sharma Zachary Tatlock and Yida Wang. 2021. Nimble: Efficiently compiling dynamic neural networks for model inference. Proceedings of Machine Learning and Systems 3 (2021) 208\u2013222."},{"key":"e_1_3_3_2_43_2","unstructured":"Karen Simonyan and Andrew Zisserman. 2014. Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1409.1556 (2014)."},{"key":"e_1_3_3_2_44_2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"e_1_3_3_2_45_2","unstructured":"Sharegpt teams. 2023. Sharegot. https:\/\/sharegpt.com\/"},{"key":"e_1_3_3_2_46_2","unstructured":"Vijay Thakkar Pradeep Ramani Cris Cecka Aniket Shivam Honghao Lu Ethan Yan Jack Kosaian Mark Hoemmen Haicheng Wu Andrew Kerr Matt Nicely Duane Merrill Dustyn Blasig Fengqi Qiao Piotr Majcher Paul Springer Markus Hohnerbach Jin Wang and Manish Gupta. 2023. CUTLASS. https:\/\/github.com\/NVIDIA\/cutlass"},{"key":"e_1_3_3_2_47_2","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar et\u00a0al. 2023. Llama: Open and efficient foundation language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2302.13971 (2023)."},{"key":"e_1_3_3_2_48_2","unstructured":"Hugo Touvron Louis Martin Kevin Stone Peter Albert Amjad Almahairi Yasmine Babaei Nikolay Bashlykov Soumya Batra Prajjwal Bhargava Shruti Bhosale et\u00a0al. 2023. Llama 2: Open foundation and fine-tuned chat models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2307.09288 (2023)."},{"key":"e_1_3_3_2_49_2","first-page":"267","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Unger Colin","year":"2022","unstructured":"Colin Unger, Zhihao Jia, Wei Wu, Sina Lin, Mandeep Baines, Carlos Efrain\u00a0Quintero Narvaez, Vinay Ramakrishnaiah, Nirmal Prajapati, Pat McCormick, Jamaludin Mohd-Yusof, et\u00a0al. 2022. Unity: Accelerating DNN training through joint optimization of algebraic transformations and parallelization. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 267\u2013284."},{"key":"e_1_3_3_2_50_2","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez \u0141ukasz Kaiser and Illia Polosukhin. 2017. Attention is all you need. Advances in neural information processing systems 30 (2017)."},{"key":"e_1_3_3_2_51_2","unstructured":"Petar Veli\u010dkovi\u0107 Guillem Cucurull Arantxa Casanova Adriana Romero Pietro Lio and Yoshua Bengio. 2017. Graph attention networks. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/1710.10903 (2017)."},{"key":"e_1_3_3_2_52_2","doi-asserted-by":"crossref","unstructured":"Endong Wang Qing Zhang Bo Shen Guangyong Zhang Xiaowei Lu Qing Wu Yajuan Wang Endong Wang Qing Zhang Bo Shen et\u00a0al. 2014. Intel math kernel library. High-Performance Computing on the Intel\u00ae Xeon Phi\u2122: How to Fully Exploit MIC Architectures (2014) 167\u2013188.","DOI":"10.1007\/978-3-319-06486-4_7"},{"key":"e_1_3_3_2_53_2","first-page":"515","volume-title":"15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21)","author":"Wang Yuke","year":"2021","unstructured":"Yuke Wang, Boyuan Feng, Gushu Li, Shuangchen Li, Lei Deng, Yuan Xie, and Yufei Ding. 2021. GNNAdvisor: An Adaptive and Efficient Runtime System for GNN Acceleration on GPUs. In 15th USENIX Symposium on Operating Systems Design and Implementation (OSDI 21). 515\u2013531."},{"key":"e_1_3_3_2_54_2","doi-asserted-by":"publisher","DOI":"10.1145\/3447786.3456247"},{"key":"e_1_3_3_2_55_2","doi-asserted-by":"crossref","unstructured":"Zonghan Wu Shirui Pan Fengwen Chen Guodong Long Chengqi Zhang and S\u00a0Yu Philip. 2020. A comprehensive survey on graph neural networks. IEEE transactions on neural networks and learning systems 32 1 (2020) 4\u201324.","DOI":"10.1109\/TNNLS.2020.2978386"},{"key":"e_1_3_3_2_56_2","unstructured":"Zhiqiang Xie Minjie Wang Zihao Ye Zheng Zhang and Rui Fan. 2022. Graphiler: Optimizing Graph Neural Networks with Message Passing Data Flow Graph. Proceedings of Machine Learning and Systems 4 (2022) 515\u2013528."},{"key":"e_1_3_3_2_57_2","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.204"},{"key":"e_1_3_3_2_58_2","unstructured":"Yichen Yang Phitchaya Phothilimthana Yisu Wang Max Willsey Sudip Roy and Jacques Pienaar. 2021. Equality saturation for tensor graph superoptimization. Proceedings of Machine Learning and Systems 3 (2021) 255\u2013268."},{"key":"e_1_3_3_2_59_2","doi-asserted-by":"publisher","DOI":"10.1117\/12.2537799"},{"key":"e_1_3_3_2_60_2","doi-asserted-by":"publisher","DOI":"10.1145\/3620665.3640390"},{"key":"e_1_3_3_2_61_2","first-page":"521","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A distributed serving system for Transformer-Based generative models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 521\u2013538."},{"key":"e_1_3_3_2_62_2","unstructured":"Wayne\u00a0Xin Zhao Kun Zhou Junyi Li Tianyi Tang Xiaolei Wang Yupeng Hou Yingqian Min Beichen Zhang Junjie Zhang Zican Dong et\u00a0al. 2023. A survey of large language models. arXiv preprint arXiv:https:\/\/arXiv.org\/abs\/2303.18223 (2023)."},{"key":"e_1_3_3_2_63_2","unstructured":"Bojian Zheng Ziheng Jiang Cody\u00a0Hao Yu Haichen Shen Joshua Fromm Yizhi Liu Yida Wang Luis Ceze Tianqi Chen and Gennady Pekhimenko. 2022. DietCode: Automatic optimization for dynamic tensor programs. Proceedings of Machine Learning and Systems 4 (2022) 848\u2013863."},{"key":"e_1_3_3_2_64_2","first-page":"863","volume-title":"14th USENIX symposium on operating systems design and implementation (OSDI 20)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody\u00a0Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, et\u00a0al. 2020. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX symposium on operating systems design and implementation (OSDI 20). 863\u2013879."},{"key":"e_1_3_3_2_65_2","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071018"},{"key":"e_1_3_3_2_66_2","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378508"},{"key":"e_1_3_3_2_67_2","doi-asserted-by":"crossref","unstructured":"Zhen Zheng Zaifeng Pan Dalin Wang Kai Zhu Wenyi Zhao Tianyou Guo Xiafei Qiu Minmin Sun Junjie Bai Feng Zhang et\u00a0al. 2023. BladeDISC: Optimizing Dynamic Shape Machine Learning Workloads via Compiler Approach. Proceedings of the ACM on Management of Data 1 3 (2023) 1\u201329.","DOI":"10.1145\/3617327"},{"key":"e_1_3_3_2_68_2","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_3_2_69_2","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575723"},{"key":"e_1_3_3_2_70_2","doi-asserted-by":"publisher","DOI":"10.1145\/3587135.3592199"},{"key":"e_1_3_3_2_71_2","first-page":"233","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, et\u00a0al. 2022. ROLLER: Fast and efficient tensor compilation for deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 233\u2013248."}],"event":{"name":"SC '25: The International Conference for High Performance Computing, Networking, Storage and Analysis","location":"St. Louis MO USA","acronym":"SC '25","sponsor":["SIGHPC ACM Special Interest Group on High Performance Computing, Special Interest Group on High Performance Computing"]},"container-title":["Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3712285.3759779","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T18:42:46Z","timestamp":1773254566000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3712285.3759779"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,11,15]]},"references-count":70,"alternative-id":["10.1145\/3712285.3759779","10.1145\/3712285"],"URL":"https:\/\/doi.org\/10.1145\/3712285.3759779","relation":{},"subject":[],"published":{"date-parts":[[2025,11,15]]},"assertion":[{"value":"2025-11-15","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}