{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,10]],"date-time":"2026-04-10T09:58:55Z","timestamp":1775815135793,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":83,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,3,25]],"date-time":"2023-03-25T00:00:00Z","timestamp":1679702400000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62322213"],"award-info":[{"award-number":["62322213"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["6217241"],"award-info":[{"award-number":["6217241"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,3,25]]},"DOI":"10.1145\/3623278.3624761","type":"proceedings-article","created":{"date-parts":[[2024,2,7]],"date-time":"2024-02-07T19:28:26Z","timestamp":1707334106000},"page":"268-286","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":6,"title":["RECom: A Compiler Approach to Accelerating Recommendation Model Inference with Massive Embedding Columns"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6759-2616","authenticated-orcid":false,"given":"Zaifeng","family":"Pan","sequence":"first","affiliation":[{"name":"Renmin University of China, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-2692-713X","authenticated-orcid":false,"given":"Zhen","family":"Zheng","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1983-7321","authenticated-orcid":false,"given":"Feng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6826-8108","authenticated-orcid":false,"given":"Ruofan","family":"Wu","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8097-4707","authenticated-orcid":false,"given":"Hao","family":"Liang","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-9016-8446","authenticated-orcid":false,"given":"Dalin","family":"Wang","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-8803-928X","authenticated-orcid":false,"given":"Xiafei","family":"Qiu","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-6805-4785","authenticated-orcid":false,"given":"Junjie","family":"Bai","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3003-0150","authenticated-orcid":false,"given":"Wei","family":"Lin","sequence":"additional","affiliation":[{"name":"Alibaba Group, Hangzhou, Zhejiang, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5757-9135","authenticated-orcid":false,"given":"Xiaoyong","family":"Du","sequence":"additional","affiliation":[{"name":"Renmin University of China, Beijing, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,2,7]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"265","volume-title":"12th USENIX OSDI 2016","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Paul Barham, Jianmin Chen, Zhifeng Chen, Andy Davis, Jeffrey Dean, Matthieu Devin, Sanjay Ghemawat, Geoffrey Irving, Michael Isard, Manjunath Kudlur, Josh Levenberg, Rajat Monga, Sherry Moore, Derek Gordon Murray, Benoit Steiner, Paul A. Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng. Tensorflow: A system for large-scale machine learning. In Kimberly Keeton and Timothy Roscoe, editors, 12th USENIX OSDI 2016, pages 265--283. USENIX Association, 2016."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.14778\/3485450.3485462"},{"key":"e_1_3_2_1_3_1","volume-title":"https:\/\/github.com\/alibaba\/DeepRec","year":"2023","unstructured":"Alibaba. Alibaba\/DeepRec. https:\/\/github.com\/alibaba\/DeepRec, 2023."},{"key":"e_1_3_2_1_4_1","volume-title":"End-to-end user behavior retrieval in click-through rateprediction model. CoRR, abs\/2108.04468","author":"Chen Qiwei","year":"2021","unstructured":"Qiwei Chen, Changhua Pei, Shanshan Lv, Chao Li, Junfeng Ge, and Wenwu Ou. End-to-end user behavior retrieval in click-through rateprediction model. CoRR, abs\/2108.04468, 2021."},{"key":"e_1_3_2_1_5_1","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Q. Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. TVM: an automated end-to-end optimizing compiler for deep learning. In Andrea C. Arpaci-Dusseau and Geoff Voelker, editors, 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018, Carlsbad, CA, USA, October 8--10, 2018, pages 578--594. USENIX Association, 2018."},{"key":"e_1_3_2_1_6_1","first-page":"3393","volume-title":"Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Lianmin Zheng, Eddie Q. Yan, Ziheng Jiang, Thierry Moreau, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. Learning to optimize tensor programs. In Samy Bengio, Hanna M. Wallach, Hugo Larochelle, Kristen Grauman, Nicol\u00f2 Cesa-Bianchi, and Roman Garnett, editors, Advances in Neural Information Processing Systems 31: Annual Conference on Neural Information Processing Systems 2018, NeurIPS 2018, December 3--8, 2018, Montr\u00e9al, Canada, pages 3393--3404, 2018."},{"issue":"1","key":"e_1_3_2_1_7_1","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1145\/3588684","article-title":"Efficient parallel graph analytics with rule-based compression","volume":"1","author":"Chen Zheng","year":"2023","unstructured":"Zheng Chen, Feng Zhang, JiaWei Guan, Jidong Zhai, Xipeng Shen, Huanchen Zhang, Wentong Shu, and Xiaoyong Du. Compressgraph: Efficient parallel graph analytics with rule-based compression. Proceedings of the ACM on Management of Data, 1(1):1--31, 2023.","journal-title":"Proceedings of the ACM on Management of Data"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/2988450.2988454"},{"key":"e_1_3_2_1_9_1","volume-title":"Easyrec: An easy-to-use, extendable and efficient framework for building industrial recommendation systems. CoRR, abs\/2209.12766","author":"Cheng Mengli","year":"2022","unstructured":"Mengli Cheng, Yue Gao, Guoqiang Liu, Hongsheng Jin, and Xiaowen Zhang. Easyrec: An easy-to-use, extendable and efficient framework for building industrial recommendation systems. CoRR, abs\/2209.12766, 2022."},{"key":"e_1_3_2_1_10_1","volume-title":"Dynamic sparse tensor algebra compilation. CoRR, abs\/2112.01394","author":"Chou Stephen","year":"2021","unstructured":"Stephen Chou and Saman P. Amarasinghe. Dynamic sparse tensor algebra compilation. CoRR, abs\/2112.01394, 2021."},{"key":"e_1_3_2_1_11_1","first-page":"823","volume-title":"Proceedings of the 41st ACM SIGPLAN International Conference on Programming Language Design and Implementation, PLDI 2020","author":"Chou Stephen","year":"2020","unstructured":"Stephen Chou, Fredrik Kjolstad, and Saman P. Amarasinghe. Automatic generation of efficient sparse tensor format conversion routines. In Alastair F. Donaldson and Emina Torlak, editors, Proceedings of the 41st ACM SIGPLAN International Conference on Programming Language Design and Implementation, PLDI 2020, London, UK, June 15--20, 2020, pages 823--838. ACM, 2020."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959190"},{"key":"e_1_3_2_1_13_1","volume-title":"https:\/\/github.com\/onnx\/onnx","author":"ONNX","year":"2023","unstructured":"ONNX developers. ONNX. https:\/\/github.com\/onnx\/onnx, 2023."},{"key":"e_1_3_2_1_14_1","volume-title":"TensorFlow FeatureColumn APIs. https:\/\/www.tensorflow.org\/api_docs\/python\/tf\/compat\/v1\/feature_column","year":"2023","unstructured":"Google. TensorFlow FeatureColumn APIs. https:\/\/www.tensorflow.org\/api_docs\/python\/tf\/compat\/v1\/feature_column, 2023."},{"key":"e_1_3_2_1_15_1","volume-title":"XLA: Optimizing Compiler for Machine Learning. https:\/\/www.tensorflow.org\/xla","year":"2023","unstructured":"Google. XLA: Optimizing Compiler for Machine Learning. https:\/\/www.tensorflow.org\/xla, 2023."},{"key":"e_1_3_2_1_16_1","first-page":"26578","article-title":"Transient redundancy elimination-based convolution","volume":"35","author":"Guan Jiawei","year":"2022","unstructured":"Jiawei Guan, Feng Zhang, Jiesong Liu, Hsin-Hsuan Sung, Ruofan Wu, Xiaoyong Du, and Xipeng Shen. Trec: Transient redundancy elimination-based convolution. Advances in Neural Information Processing Systems, 35:26578--26589, 2022.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00084"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485505"},{"key":"e_1_3_2_1_20_1","first-page":"5","article-title":"Automatic load-compute pipelining in deep learning compiler for ai-gpus","author":"Huang Guyue","year":"2023","unstructured":"Guyue Huang, Yang Bai, Liu Liu, Yuke Wang, Bei Yu, Yufei Ding, and Yuan Xie. Alcop: Automatic load-compute pipelining in deep learning compiler for ai-gpus. Proceedings of Machine Learning and Systems, 5, 2023.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00083"},{"key":"e_1_3_2_1_22_1","volume-title":"Intel Performance Counter Monitor (Intel PCM). https:\/\/github.com\/intel\/pcm","year":"2023","unstructured":"Intel. Intel Performance Counter Monitor (Intel PCM). https:\/\/github.com\/intel\/pcm, 2023."},{"key":"e_1_3_2_1_23_1","volume-title":"OneAPI Deep Neural Network Library (oneDNN). https:\/\/github.com\/oneapi-src\/oneDNN","year":"2023","unstructured":"Intel. OneAPI Deep Neural Network Library (oneDNN). https:\/\/github.com\/oneapi-src\/oneDNN, 2023."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3326937.3341255"},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of Machine Learning and Systems 2021","author":"Jiang Wenqi","year":"2021","unstructured":"Wenqi Jiang, Zhenhao He, Shuai Zhang, Thomas B. Preu\u00dfer, Kai Zeng, Liang Feng, Jiansong Zhang, Tongxuan Liu, Yong Li, Jingren Zhou, Ce Zhang, and Gustavo Alonso. Microrec: Efficient recommendation inference by hardware and data structure solutions. In Alex Smola, Alex Dimakis, and Ion Stoica, editors, Proceedings of Machine Learning and Systems 2021, MLSys 2021, virtual, April 5--9, 2021. mlsys.org, 2021."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467139"},{"key":"e_1_3_2_1_27_1","volume-title":"Display Advertising Challenge. https:\/\/www.kaggle.com\/c\/criteo-display-ad-challenge","year":"2023","unstructured":"Kaggle. Display Advertising Challenge. https:\/\/www.kaggle.com\/c\/criteo-display-ad-challenge, 2023."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00070"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00019"},{"key":"e_1_3_2_1_30_1","first-page":"180","volume-title":"IEEE\/ACM International Symposium on Code Generation and Optimization, CGO 2019","author":"Kjolstad Fredrik","year":"2019","unstructured":"Fredrik Kjolstad, Peter Ahrens, Shoaib Kamil, and Saman P. Amarasinghe. Tensor algebra compilation with workspaces. In Mahmut Taylan Kandemir, Alexandra Jimborean, and Tipp Moseley, editors, IEEE\/ACM International Symposium on Code Generation and Optimization, CGO 2019, Washington, DC, USA, February 16--20, 2019, pages 180--192. IEEE, 2019."},{"key":"e_1_3_2_1_31_1","volume-title":"Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, 25","author":"Krizhevsky Alex","year":"2012","unstructured":"Alex Krizhevsky, Ilya Sutskever, and Geoffrey E Hinton. Imagenet classification with deep convolutional neural networks. Advances in neural information processing systems, 25, 2012."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527386"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3030548"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3336191.3371785"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582062"},{"key":"e_1_3_2_1_37_1","first-page":"881","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Ma Lingxiao","year":"2020","unstructured":"Lingxiao Ma, Zhiqiang Xie, Zhi Yang, Jilong Xue, Youshan Miao, Wei Cui, Wenxiang Hu, Fan Yang, Lintao Zhang, and Lidong Zhou. Rammer: Enabling holistic deep learning compiler optimizations with rtasks. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4--6, 2020, pages 881--897. USENIX Association, 2020."},{"key":"e_1_3_2_1_38_1","volume-title":"Pytorch domain library for recommendation systems. https:\/\/github.com\/pytorch\/torchrec","year":"2023","unstructured":"Meta. Pytorch domain library for recommendation systems. https:\/\/github.com\/pytorch\/torchrec, 2023."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533727"},{"key":"e_1_3_2_1_40_1","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. Deep learning recommendation model for personalization and recommendation systems. CoRR abs\/1906.00091 2019."},{"key":"e_1_3_2_1_41_1","volume-title":"Getting Started with CUDA Graphs. https:\/\/developer.nvidia.com\/blog\/cuda-graphs\/","author":"NVIDIA.","year":"2019","unstructured":"NVIDIA. Getting Started with CUDA Graphs. https:\/\/developer.nvidia.com\/blog\/cuda-graphs\/, 2019."},{"key":"e_1_3_2_1_42_1","volume-title":"Kernel Profiling Guide. https:\/\/docs.nvidia.com\/nsight-compute\/ProfilingGuide","author":"NVIDIA.","year":"2023","unstructured":"NVIDIA. Kernel Profiling Guide. https:\/\/docs.nvidia.com\/nsight-compute\/ProfilingGuide, 2023."},{"key":"e_1_3_2_1_43_1","volume-title":"https:\/\/github.com\/NVIDIA-Merlin\/HugeCTR","author":"NVIDIA.","year":"2023","unstructured":"NVIDIA. NVIDIA-Merlin\/HugeCTR. https:\/\/github.com\/NVIDIA-Merlin\/HugeCTR, 2023."},{"key":"e_1_3_2_1_44_1","volume-title":"NVIDIA System Management Interface. https:\/\/developer.nvidia.com\/nvidia-system-management-interface","author":"NVIDIA.","year":"2023","unstructured":"NVIDIA. NVIDIA System Management Interface. https:\/\/developer.nvidia.com\/nvidia-system-management-interface, 2023."},{"key":"e_1_3_2_1_45_1","volume-title":"Programming Guide :: CUDA Toolkit Documentation. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html","author":"NVIDIA.","year":"2023","unstructured":"NVIDIA. Programming Guide :: CUDA Toolkit Documentation. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html, 2023."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414656"},{"issue":"11","key":"e_1_3_2_1_47_1","first-page":"3015","article-title":"A GPU-Based Sub-Linear Deep Learning Engine via LSH Sparsification","volume":"33","author":"Pan Zaifeng","year":"2021","unstructured":"Zaifeng Pan, Feng Zhang, Hourun Li, Chenyang Zhang, Xiaoyong Du, and Dong Deng. G-SLIDE: A GPU-Based Sub-Linear Deep Learning Engine via LSH Sparsification. IEEE Transactions on Parallel and Distributed Systems, 33(11):3015--3027, 2021.","journal-title":"IEEE Transactions on Parallel and Distributed Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00045"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3428226"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507777"},{"key":"e_1_3_2_1_51_1","first-page":"208","article-title":"Efficiently compiling dynamic neural networks for model inference","volume":"3","author":"Shen Haichen","year":"2021","unstructured":"Haichen Shen, Jared Roesch, Zhi Chen, Wei Chen, Yong Wu, Mu Li, Vin Sharma, Zachary Tatlock, and Yida Wang. Nimble: Efficiently compiling dynamic neural networks for model inference. Proceedings of Machine Learning and Systems, 3:208--222, 2021.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1007\/s41019-022-00195-3"},{"key":"e_1_3_2_1_53_1","first-page":"821","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022","author":"Sima Chijun","year":"2022","unstructured":"Chijun Sima, Yao Fu, Man-Kit Sit, Liyi Guo, Xuri Gong, Feng Lin, Junyu Wu, Yongsheng Li, Haidong Rong, Pierre-Louis Aublin, and Luo Mai. Ekko: A large-scale deep learning recommender system with low-latency model update. In Marcos K. Aguilera and Hakim Weatherspoon, editors, 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11--13, 2022, pages 821--839. USENIX Association, 2022."},{"key":"e_1_3_2_1_54_1","volume-title":"SymEngine: a fast C++ symbolic manipulation library. https:\/\/github.com\/symengine\/symengine","year":"2023","unstructured":"SymEngine. SymEngine: a fast C++ symbolic manipulation library. https:\/\/github.com\/symengine\/symengine, 2023."},{"key":"e_1_3_2_1_55_1","first-page":"2","volume-title":"GPU Technology Conference","volume":"1","author":"Vanholder Han","year":"2016","unstructured":"Han Vanholder. Efficient inference with tensorrt. In GPU Technology Conference, volume 1, page 2, 2016."},{"key":"e_1_3_2_1_56_1","volume-title":"Attention is all you need. Advances in neural information processing systems, 30","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2023.3307795"},{"key":"e_1_3_2_1_58_1","volume-title":"Deep bayesian multi-target learning for recommender systems. CoRR, abs\/1902.09154","author":"Wang Qi","year":"2019","unstructured":"Qi Wang, Zhihui Ji, Huasheng Liu, and Binqiang Zhao. Deep bayesian multi-target learning for recommender systems. CoRR, abs\/1902.09154, 2019."},{"key":"e_1_3_2_1_59_1","first-page":"515","volume-title":"15th USENIX symposium on operating systems design and implementation (OSDI 21)","author":"Wang Yuke","year":"2021","unstructured":"Yuke Wang, Boyuan Feng, Gushu Li, Shuangchen Li, Lei Deng, Yuan Xie, and Yufei Ding. {GNNAdvisor }: An adaptive and efficient runtime system for {GNN} acceleration on {GPUs }. In 15th USENIX symposium on operating systems design and implementation (OSDI 21), pages 515--531, 2021."},{"key":"e_1_3_2_1_60_1","first-page":"779","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Wang Yuke","year":"2023","unstructured":"Yuke Wang, Boyuan Feng, Zheng Wang, Tong Geng, Kevin Barker, Ang Li, and Yufei Ding. {MGG}: Accelerating graph neural networks with {Fine-Grained}{Intra-Kernel}{Communication-Computation} pipelining on {Multi-GPU} platforms. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23), pages 779--795, 2023."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00075"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446763"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485447.3511985"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519554"},{"key":"e_1_3_2_1_65_1","volume-title":"Proceedings of Machine Learning and Systems 2022","author":"Xing Jiarong","year":"2022","unstructured":"Jiarong Xing, Leyuan Wang, Shang Zhang, Jack Chen, Ang Chen, and Yibo Zhu. Bolt: Bridging the gap between auto-tuners and hardware-native performance. In Diana Marculescu, Yuejie Chi, and Carole-Jean Wu, editors, Proceedings of Machine Learning and Systems 2022, MLSys 2022, Santa Clara, CA, USA, August 29 - September 1, 2022. mlsys.org, 2022."},{"key":"e_1_3_2_1_66_1","first-page":"2404","volume-title":"SIGMOD '21: International Conference on Management of Data","author":"Xu Zhiqiang","year":"2021","unstructured":"Zhiqiang Xu, Dong Li, Weijie Zhao, Xing Shen, Tianbo Huang, Xiaoyun Li, and Ping Li. Agile and accurate CTR prediction model training for massive-scale online advertising systems. In Guoliang Li, Zhanhuai Li, Stratos Idreos, and Divesh Srivastava, editors, SIGMOD '21: International Conference on Management of Data, Virtual Event, China, June 20--25, 2021, pages 2404--2409. ACM, 2021."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2022.3201531"},{"key":"e_1_3_2_1_68_1","first-page":"841","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022","author":"Zeng Chaoliang","year":"2022","unstructured":"Chaoliang Zeng, Layong Luo, Qingsong Ning, Yaodong Han, Yuhang Jiang, Ding Tang, Zilong Wang, Kai Chen, and Chuanxiong Guo. FAERY: an fpga-accelerated embedding-based retrieval system. In Marcos K. Aguilera and Hakim Weatherspoon, editors, 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11--13, 2022, pages 841--856. USENIX Association, 2022."},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS47924.2020.00057"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDE53745.2022.00324"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533044"},{"key":"e_1_3_2_1_72_1","volume-title":"Proceedings of Machine Learning and Systems 2022","author":"Zheng Bojian","year":"2022","unstructured":"Bojian Zheng, Ziheng Jiang, Cody Hao Yu, Haichen Shen, Joshua Fromm, Yizhi Liu, Yida Wang, Luis Ceze, Tianqi Chen, and Gennady Pekhimenko. Dietcode: Automatic optimization for dynamic tensor programs. In Diana Marculescu, Yuejie Chi, and Carole-Jean Wu, editors, Proceedings of Machine Learning and Systems 2022, MLSys 2022, Santa Clara, CA, USA, August 29 - September 1, 2022. mlsys.org, 2022."},{"key":"e_1_3_2_1_73_1","first-page":"863","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. Ansor: Generating high-performance tensor programs for deep learning. In 14th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2020, Virtual Event, November 4--6, 2020, pages 863--879. USENIX Association, 2020."},{"key":"e_1_3_2_1_74_1","first-page":"859","article-title":"An automatic schedule exploration and optimization framework for tensor computation on heterogeneous system. In James R. Larus, Luis Ceze, and Karin Strauss, editors, ASPLOS '20: Architectural Support for Programming Languages and Operating Systems, Lausanne","volume":"2020","author":"Zheng Size","year":"2020","unstructured":"Size Zheng, Yun Liang, Shuo Wang, Renze Chen, and Kaiwen Sheng. Flextensor: An automatic schedule exploration and optimization framework for tensor computation on heterogeneous system. In James R. Larus, Luis Ceze, and Karin Strauss, editors, ASPLOS '20: Architectural Support for Programming Languages and Operating Systems, Lausanne, Switzerland, March 16--20, 2020, pages 859--873. ACM, 2020.","journal-title":"Switzerland, March 16--20"},{"key":"e_1_3_2_1_75_1","first-page":"587","volume-title":"Proceedings of the 50th Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO 2017","author":"Zheng Zhen","year":"2017","unstructured":"Zhen Zheng, Chanyoung Oh, Jidong Zhai, Xipeng Shen, Youngmin Yi, and Wenguang Chen. Versapipe: a versatile programming framework for pipelined computing on GPU. In Hillery C. Hunter, Jaime Moreno, Joel S. Emer, and Daniel S\u00e1nchez, editors, Proceedings of the 50th Annual IEEE\/ACM International Symposium on Microarchitecture, MICRO 2017, Cambridge, MA, USA, October 14--18, 2017, pages 587--599. ACM, 2017."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1145\/3297858.3304032"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/3617327"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33015941"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219823"},{"key":"e_1_3_2_1_81_1","first-page":"233","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. ROLLER: fast and efficient tensor compilation for deep learning. In Marcos K. Aguilera and Hakim Weatherspoon, editors, 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11--13, 2022, pages 233--248. USENIX Association, 2022."},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458838"},{"key":"e_1_3_2_1_83_1","first-page":"316","article-title":"Randomness in neural network training: Characterizing the impact of tooling","volume":"4","author":"Zhuang Donglin","year":"2022","unstructured":"Donglin Zhuang, Xingyao Zhang, Shuaiwen Song, and Sara Hooker. Randomness in neural network training: Characterizing the impact of tooling. Proceedings of Machine Learning and Systems, 4:316--336, 2022.","journal-title":"Proceedings of Machine Learning and Systems"}],"event":{"name":"ASPLOS '23: 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 4","location":"Vancouver BC Canada","acronym":"ASPLOS '23","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 4"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3623278.3624761","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3623278.3624761","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T16:36:26Z","timestamp":1750178186000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3623278.3624761"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,3,25]]},"references-count":83,"alternative-id":["10.1145\/3623278.3624761","10.1145\/3623278"],"URL":"https:\/\/doi.org\/10.1145\/3623278.3624761","relation":{},"subject":[],"published":{"date-parts":[[2023,3,25]]},"assertion":[{"value":"2024-02-07","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}