{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,12]],"date-time":"2026-03-12T01:05:05Z","timestamp":1773277505624,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":60,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,17]],"date-time":"2024-04-17T00:00:00Z","timestamp":1713312000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"National Key R&D Program of China","award":["2021ZD0110101"],"award-info":[{"award-number":["2021ZD0110101"]}]},{"name":"the National Natural Science Foundation of China","award":["T2222026"],"award-info":[{"award-number":["T2222026"]}]},{"name":"the National Natural Science Foundation of China","award":["22003073"],"award-info":[{"award-number":["22003073"]}]},{"name":"the National Natural Science Foundation of China","award":["62232015"],"award-info":[{"award-number":["62232015"]}]},{"name":"the National Natural Science Foundation of China","award":["62090024"],"award-info":[{"award-number":["62090024"]}]},{"name":"the Innovation Funding of ICT CAS","award":["E361010"],"award-info":[{"award-number":["E361010"]}]},{"name":"Beijing Nova Program"},{"name":"the UK Engineering and Physical Sciences Research Council","award":["EP\/X018202\/1"],"award-info":[{"award-number":["EP\/X018202\/1"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3617232.3624858","type":"proceedings-article","created":{"date-parts":[[2024,4,17]],"date-time":"2024-04-17T20:10:56Z","timestamp":1713384656000},"page":"286-301","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":12,"title":["Optimizing Deep Learning Inference via Global Analysis and Tensor Expressions"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0003-2014-5453","authenticated-orcid":false,"given":"Chunwei","family":"Xia","sequence":"first","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"},{"name":"School of Computer Science and Technology, University of Chinese Academy of Sciences, Beijing, China"},{"name":"School of Computing, University of Leeds, Leeds, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5228-8972","authenticated-orcid":false,"given":"Jiacheng","family":"Zhao","sequence":"additional","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"},{"name":"School of Computer Science and Technology, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-8939-7721","authenticated-orcid":false,"given":"Qianqi","family":"Sun","sequence":"additional","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"},{"name":"School of Computer Science and Technology, University of Chinese Academy of Sciences, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6157-0662","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"additional","affiliation":[{"name":"School of Computing, University of Leeds, Leeds, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6747-947X","authenticated-orcid":false,"given":"Yuan","family":"Wen","sequence":"additional","affiliation":[{"name":"University of Aberdeen, Aberdeen, United Kingdom"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4391-8295","authenticated-orcid":false,"given":"Teng","family":"Yu","sequence":"additional","affiliation":[{"name":"Thewake Systems Ltd, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2909-7750","authenticated-orcid":false,"given":"Xiaobing","family":"Feng","sequence":"additional","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"},{"name":"School of Computer Science and Technology, University of Chinese Academy of Sciences, Beijing, China"},{"name":"Zhongguancun Laboratory, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2491-7679","authenticated-orcid":false,"given":"Huimin","family":"Cui","sequence":"additional","affiliation":[{"name":"State Key Lab of Processors, Institute of Computing Technology, Chinese Academy of Sciences, Beijing, China"},{"name":"School of Computer Science and Technology, University of Chinese Academy of Sciences, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,4,17]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"[n. d.]. IREE: Intermediate Representation Execution Environment. https:\/\/github.com\/iree-org\/iree."},{"key":"e_1_3_2_1_2_1","volume-title":"TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16)","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, et al. 2016. TensorFlow: A System for Large-Scale Machine Learning. In 12th USENIX Symposium on Operating Systems Design and Implementation (OSDI 16). USENIX Association, 265--283. https:\/\/www.usenix.org\/conference\/osdi16\/technical-sessions\/presentation\/abadi"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3192366.3192401"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2858788.2688521"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO.2019.8661197"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/2212908.2212917"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","unstructured":"U. Bondhugula A. Acharya and A Cohen. 2016. The Pluto+ Algorithm: A Practical Approach for Parallelization and Locality Optimization of Affine Loop Nests. In ACM Transactions on Programming Languages and Systems.","DOI":"10.1145\/2896389"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/1854273.1854317"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/1375581.1375595"},{"key":"e_1_3_2_1_10_1","unstructured":"Tom Brown Benjamin Mann Nick Ryder Melanie Subbiah Jared D Kaplan Prafulla Dhariwal Arvind Neelakantan Pranav Shyam Girish Sastry et al. 2020. Language models are few-shot learners. Advances in neural information processing systems 33 (2020) 1877--1901."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.drudis.2018.01.039"},{"key":"e_1_3_2_1_12_1","volume-title":"TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18)","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: An Automated End-to-End Optimizing Compiler for Deep Learning. In 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18). USENIX Association, 578--594. https:\/\/www.usenix.org\/conference\/osdi18\/presentation\/chen"},{"key":"e_1_3_2_1_13_1","unstructured":"ONNX Runtime developers. 2021. ONNX Runtime. https:\/\/onnxruntime.ai\/. Version: x.y.z."},{"key":"e_1_3_2_1_14_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3576933"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467286"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579990.3580017"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454038"},{"key":"e_1_3_2_1_23_1","volume-title":"International Workshop on Languages and Compilers for Parallel Computing. Springer, 301--320","author":"Kennedy Ken","year":"1993","unstructured":"Ken Kennedy and Kathryn S McKinley. 1993. Maximizing loop parallelism and improving data locality via loop fusion and distribution. In International Workshop on Languages and Compilers for Parallel Computing. Springer, 301--320."},{"key":"e_1_3_2_1_24_1","volume-title":"Kingma and Jimmy Ba","author":"Diederik","year":"2015","unstructured":"Diederik P. Kingma and Jimmy Ba. 2015. Adam: A Method for Stochastic Optimization. In 3rd International Conference on Learning Representations, ICLR 2015, San Diego, CA, USA, May 7--9, 2015, Conference Track Proceedings, Yoshua Bengio and Yann LeCun (Eds.). http:\/\/arxiv.org\/abs\/1412.6980"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/ASE.2017.8115709"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-29859-3_8"},{"key":"e_1_3_2_1_27_1","volume-title":"XLA: TensorFlow, compiled. Tesor-Flow Dev Summit.","author":"Leary Chris","year":"2017","unstructured":"Chris Leary and Todd Wang. 2017. XLA: TensorFlow, compiled. Tesor-Flow Dev Summit."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173191"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_2"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2022.3178068"},{"key":"e_1_3_2_1_31_1","volume-title":"Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. CoRR abs\/2103.14030","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo. 2021. Swin Transformer: Hierarchical Vision Transformer using Shifted Windows. CoRR abs\/2103.14030 (2021). arXiv:2103.14030 https:\/\/arxiv.org\/abs\/2103.14030"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3220007"},{"key":"e_1_3_2_1_33_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Ma Lingxiao","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. [n. d.]. Rammer: Enabling Holistic Deep Learning Compiler Optimizations with rTasks. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX Association, 881--897. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/ma"},{"key":"e_1_3_2_1_34_1","unstructured":"Microsoft. 2022. Antares. https:\/\/github.com\/microsoft\/antares\/tree\/latest."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/2897824.2925952"},{"key":"e_1_3_2_1_36_1","unstructured":"Multi-Level IR Compiler Framework committee. 2022. 'affine' Dialect."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"e_1_3_2_1_38_1","unstructured":"NVIDIA Corporation. 2021. TensorRT. https:\/\/developer.nvidia.com\/tensorrt."},{"key":"e_1_3_2_1_39_1","unstructured":"NVIDIA Corporation. 2022. NVIDIA Nsight Compute."},{"key":"e_1_3_2_1_40_1","unstructured":"NVIDIA Corporation. 2023. CUDA Grid Synchronization. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/#grid-synchronization."},{"key":"e_1_3_2_1_41_1","unstructured":"Adam Paszke Sam Gross Francisco Massa Adam Lerer et al. 2019. PyTorch: An Imperative Style High-Performance Deep Learning Library. CoRR abs\/1912.01703 (2019). arXiv:1912.01703 http:\/\/arxiv.org\/abs\/1912.01703"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.5555\/3314872.3314901"},{"key":"e_1_3_2_1_43_1","volume-title":"Mlir: Multi-level intermediate representation for compiler infrastructure.","author":"Shpeisman Tatiana","year":"2019","unstructured":"Tatiana Shpeisman and Chris Lattner. 2019. Mlir: Multi-level intermediate representation for compiler infrastructure."},{"key":"e_1_3_2_1_44_1","volume-title":"Proceedings of the 27th International Conference on Neural Information Processing Systems -","volume":"2","author":"Sutskever Ilya","unstructured":"Ilya Sutskever, Oriol Vinyals, and Quoc V. Le. [n. d.]. Sequence to Sequence Learning with Neural Networks. In Proceedings of the 27th International Conference on Neural Information Processing Systems - Volume 2 (Montreal, Canada) (NIPS'14). MIT Press, 3104--3112."},{"key":"e_1_3_2_1_45_1","volume-title":"Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence. 4278--4284","author":"Szegedy Christian","unstructured":"Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, and Alexander A. Alemi. 2017. Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning. In Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence. 4278--4284."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3433103"},{"key":"e_1_3_2_1_47_1","unstructured":"Tianqi Chen. 2022. Working with Operators Using Tensor Expression. https:\/\/tvm.apache.org\/docs\/tutorial\/tensor_expr_get_started.html."},{"key":"e_1_3_2_1_48_1","volume-title":"Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. CoRR abs\/1802.04730","author":"Vasilache Nicolas","year":"2018","unstructured":"Nicolas Vasilache, Oleksandr Zinenko, et al. 2018. Tensor Comprehensions: Framework-Agnostic High-Performance Machine Learning Abstractions. CoRR abs\/1802.04730 (2018). arXiv:1802.04730 http:\/\/arxiv.org\/abs\/1802.04730"},{"key":"e_1_3_2_1_49_1","volume-title":"CoRR abs\/1706.03762","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. 2017. Attention Is All You Need. CoRR abs\/1706.03762 (2017). arXiv:1706.03762 http:\/\/arxiv.org\/abs\/1706.03762"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC.2014.21"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3497776.3517769"},{"key":"e_1_3_2_1_52_1","volume-title":"Proceedings of Machine Learning and Systems 2021","author":"Wang Shang","year":"2021","unstructured":"Shang Wang, Peiming Yang, Yuxuan Zheng, Xin Li, and Gennady Pekhimenko. 2021. Horizontally Fused Training Array: An Effective Hardware Utilization Squeezer for Training Novel Deep Learning Models. In Proceedings of Machine Learning and Systems 2021. mlsys.org. https:\/\/proceedings.mlsys.org\/paper\/2021\/hash\/a97da629b098b75c294dffdc3e463904-Abstract.html"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2018.2817118"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPSW.2012.300"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of Machine Learning and Systems 2022","author":"Zhao Jie","year":"2022","unstructured":"Jie Zhao, Xiong Gao, et al. 2022. Apollo: Automatic Partition-based Operator Fusion through Layer by Layer Optimization. In Proceedings of Machine Learning and Systems 2022. https:\/\/proceedings.mlsys.org\/paper\/2022\/hash\/069059b7ef840f0c74a814ec9237b6ec-Abstract.html"},{"key":"e_1_3_2_1_57_1","volume-title":"Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, et al. [n. d.]. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, 2020. 863--879. https:\/\/www.usenix.org\/conference\/osdi20\/presentation\/zheng"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","unstructured":"Guorui Zhou Na Mou Ying Fan Qi Pi Weijie Bian Chang Zhou Xiaoqiang Zhu and Kun Gai. 2018. Deep Interest Evolution Network for Click-Through Rate Prediction. 10.48550\/ARXIV.1809.03672","DOI":"10.48550\/ARXIV.1809.03672"},{"key":"e_1_3_2_1_60_1","volume-title":"ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, et al. 2022. ROLLER: Fast and Efficient Tensor Compilation for Deep Learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, 233--248. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/zhu"}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 1"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3617232.3624858","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3617232.3624858","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:46:13Z","timestamp":1750178773000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3617232.3624858"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,17]]},"references-count":60,"alternative-id":["10.1145\/3617232.3624858","10.1145\/3617232"],"URL":"https:\/\/doi.org\/10.1145\/3617232.3624858","relation":{},"subject":[],"published":{"date-parts":[[2024,4,17]]},"assertion":[{"value":"2024-04-17","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}