{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:40:01Z","timestamp":1755877201729,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":64,"publisher":"ACM","funder":[{"name":"the National Natural Science Foundation of China under Grants","award":["62471055, 62406039, 62401080, U23B2001, 62321001, 62201072, 62171057"],"award-info":[{"award-number":["62471055, 62406039, 62401080, U23B2001, 62321001, 62201072, 62171057"]}]},{"name":"National Key R&D Program of China","award":["2024YFE0200800"],"award-info":[{"award-number":["2024YFE0200800"]}]},{"name":"the Ministry of Education and China Mobile Joint Fund","award":["MCM20200202, MCM20180101"],"award-info":[{"award-number":["MCM20200202, MCM20180101"]}]},{"name":"the Fundamental Research Funds for the Central Universities","award":["2024PTB-004"],"award-info":[{"award-number":["2024PTB-004"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2025,7,16]]},"DOI":"10.1145\/3694906.3743310","type":"proceedings-article","created":{"date-parts":[[2025,7,16]],"date-time":"2025-07-16T16:19:56Z","timestamp":1752682796000},"page":"473-486","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["RACE: Operator Choreography for Inference Acceleration in Personalized Recommender System"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0005-2542-8542","authenticated-orcid":false,"given":"Shaolong","family":"Li","sequence":"first","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing, University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8300-0270","authenticated-orcid":false,"given":"Xiang","family":"Yang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing, University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5701-6417","authenticated-orcid":false,"given":"Jiaxing","family":"Liu","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing, University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0829-4624","authenticated-orcid":false,"given":"Qi","family":"Qi","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing, University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1486-0573","authenticated-orcid":false,"given":"Jianxin","family":"Liao","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing, University of Posts and Telecommunications, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-6656-7120","authenticated-orcid":false,"given":"Jun","family":"Huang","sequence":"additional","affiliation":[{"name":"Meituan Corporation, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2182-2228","authenticated-orcid":false,"given":"Jingyu","family":"Wang","sequence":"additional","affiliation":[{"name":"State Key Laboratory of Networking and Switching Technology, Beijing, University of Posts and Telecommunications, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2025,7,16]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"accessed 2023. cuDF - GPU DataFrames. https:\/\/github.com\/rapidsai\/cudf."},{"key":"e_1_3_2_1_2_1","unstructured":"accessed 2023. NVIDIA cuCollections. https:\/\/github.com\/NVIDIA\/cuCollections."},{"key":"e_1_3_2_1_3_1","unstructured":"accessed 2023. TensorFlow XLA. https:\/\/www.tensorflow.org\/xla."},{"key":"e_1_3_2_1_4_1","unstructured":"accessed 2023. TensorRT in TensorFlow. https:\/\/github.com\/tensorflow\/tensorrt."},{"key":"e_1_3_2_1_5_1","volume-title":"Tensorflow: Large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:1603.04467","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Ashish Agarwal, Paul Barham, Eugene Brevdo, Zhifeng Chen, Craig Citro, Greg S Corrado, Andy Davis, Jeffrey Dean, Matthieu Devin, et al. 2016. Tensorflow: Large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:1603.04467 (2016)."},{"key":"e_1_3_2_1_6_1","unstructured":"Mart\u00edn Abadi Ashish Agarwal Paul Barham Eugene Brevdo Zhifeng Chen Craig Citro Greg S. Corrado Andy Davis Jeffrey Dean Matthieu Devin Sanjay Ghemawat Ian Goodfellow Andrew Harp Geoffrey Irving Michael Isard Yangqing Jia Rafal Jozefowicz Lukasz Kaiser Manjunath Kudlur Josh Levenberg Dandelion Man\u00e9 Rajat Monga Sherry Moore Derek Murray Chris Olah Mike Schuster Jonathon Shlens Benoit Steiner Ilya Sutskever Kunal Talwar Paul Tucker Vincent Vanhoucke Vijay Vasudevan Fernanda Vi\u00e9gas Oriol Vinyals Pete Warden Martin Wattenberg Martin Wicke Yuan Yu and Xiaoqiang Zheng. 2015. TensorFlow: Large-Scale Machine Learning on Heterogeneous Systems. https:\/\/www.tensorflow.org\/ Software available from tensorflow.org."},{"key":"e_1_3_2_1_7_1","volume-title":"NIPS Machine Learning for Systems Workshop.","author":"Addanki Ravichandra","year":"2018","unstructured":"Ravichandra Addanki, Shaileshh Bojja Venkatakrishnan, Shreyan Gupta, Hongzi Mao, and Mohammad Alizadeh. 2018. Placeto: Efficient progressive device placement optimization. In NIPS Machine Learning for Systems Workshop."},{"key":"e_1_3_2_1_8_1","volume-title":"Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 499--514","author":"Bai Zhihao","year":"2020","unstructured":"Zhihao Bai, Zhen Zhang, Yibo Zhu, and Xin Jin. 2020. Pipeswitch: Fast pipelined context switching for deep learning applications. In Proceedings of the 14th USENIX Conference on Operating Systems Design and Implementation. 499--514."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2004.60"},{"key":"e_1_3_2_1_10_1","volume-title":"TVM: An automated end-to-end optimizing compiler for deep learning. arXiv preprint arXiv:1802.04799","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Meghan Cowan, Haichen Shen, Leyuan Wang, Yuwei Hu, Luis Ceze, et al. 2018. TVM: An automated end-to-end optimizing compiler for deep learning. arXiv preprint arXiv:1802.04799 (2018)."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467220"},{"key":"e_1_3_2_1_12_1","volume-title":"Learning to optimize tensor programs. Advances in Neural Information Processing Systems 31","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Lianmin Zheng, Eddie Yan, Ziheng Jiang, Thierry Moreau, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. Learning to optimize tensor programs. Advances in Neural Information Processing Systems 31 (2018)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2988450.2988454"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959190"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737486"},{"key":"e_1_3_2_1_16_1","volume-title":"International Conference on Machine Learning. PMLR, 1676--1684","author":"Gao Yuanxiang","year":"2018","unstructured":"Yuanxiang Gao, Li Chen, and Baochun Li. 2018. Spotlight: Optimizing device placement for training deep neural networks. In International Conference on Machine Learning. PMLR, 1676--1684."},{"key":"e_1_3_2_1_17_1","volume-title":"Post-training 4-bit quantization on embedding tables. arXiv preprint arXiv:1911.02079","author":"Guan Hui","year":"2019","unstructured":"Hui Guan, Andrey Malevich, Jiyan Yang, Jongsoo Park, and Hector Yuen. 2019. Post-training 4-bit quantization on embedding tables. arXiv preprint arXiv:1911.02079 (2019)."},{"key":"e_1_3_2_1_18_1","unstructured":"Huifeng Guo Ruiming Tang Yunming Ye Zhenguo Li and Xiuqiang He. 2017. DeepFM: a factorization-machine based neural network for CTR prediction. arXiv preprint arXiv:1703.04247 (2017)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3077136.3080777"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052569"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM.2019.8737614"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM48880.2022.9796763"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3547387"},{"key":"e_1_3_2_1_25_1","unstructured":"Zhihao Jia Sina Lin Charles R Qi and Alex Aiken. 2018. Exploring Hidden Dimensions in Parallelizing Convolutional Neural Networks.. In ICML. 2279--2288."},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_27_1","first-page":"1","article-title":"Beyond Data and Model Parallelism for Deep Neural Networks","volume":"1","author":"Jia Zhihao","year":"2019","unstructured":"Zhihao Jia, Matei Zaharia, and Alex Aiken. 2019. Beyond Data and Model Parallelism for Deep Neural Networks. Proceedings of Machine Learning and Systems 1 (2019), 1--13.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462941"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467304"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.1109\/MC.2009.263"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1109\/CGO51591.2021.9370308"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v37i4.25564"},{"key":"e_1_3_2_1_33_1","volume-title":"17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph E Gonzalez, et al. 2023. {AlpaServe}: Statistical multiplexing with model parallelism for deep learning serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). 663--679."},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3366423.3380151"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539058"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3511808.3557411"},{"key":"e_1_3_2_1_37_1","volume-title":"International Conference on Machine Learning. PMLR, 2430--2439","author":"Mirhoseini Azalia","year":"2017","unstructured":"Azalia Mirhoseini, Hieu Pham, Quoc V Le, Benoit Steiner, Rasmus Larsen, Yuefeng Zhou, Naveen Kumar, Mohammad Norouzi, Samy Bengio, and Jeff Dean. 2017. Device placement optimization with reinforcement learning. In International Conference on Machine Learning. PMLR, 2430--2439."},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_39_1","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. CoRR abs\/1906.00091 (2019). https:\/\/arxiv.org\/abs\/1906.00091"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3623278.3624761"},{"volume-title":"SC24: International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Pan Zaifeng","key":"e_1_3_2_1_41_1","unstructured":"Zaifeng Pan, Zhen Zheng, Feng Zhang, Bing Xie, Ruofan Wu, Shaden Smith, Chuanjie Liu, Olatunji Ruwase, Xiaoyong Du, and Yufei Ding. 2024. RecFlex: Enabling Feature Heterogeneity-Aware Optimization for Deep Recommendation Models with Flexible Schedules. In SC24: International Conference for High Performance Computing, Networking, Storage and Analysis. IEEE, 1--15."},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/2499370.2462176"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507777"},{"key":"e_1_3_2_1_44_1","first-page":"208","article-title":"Nimble: Efficiently compiling dynamic neural networks for model inference","volume":"3","author":"Shen Haichen","year":"2021","unstructured":"Haichen Shen, Jared Roesch, Zhi Chen, Wei Chen, Yong Wu, Mu Li, Vin Sharma, Zachary Tatlock, and Yida Wang. 2021. Nimble: Efficiently compiling dynamic neural networks for model inference. Proceedings of Machine Learning and Systems 3 (2021), 208--222.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_45_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. 2019. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3397271.3401125"},{"key":"e_1_3_2_1_47_1","first-page":"15451","article-title":"Efficient algorithms for device placement of dnn graph operators","volume":"33","author":"Tarnawski Jakub M","year":"2020","unstructured":"Jakub M Tarnawski, Amar Phanishayee, Nikhil Devanur, Divya Mahajan, and Fanny Nina Paravecino. 2020. Efficient algorithms for device placement of dnn graph operators. Advances in Neural Information Processing Systems 33 (2020), 15451--15463.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOM42981.2021.9488726"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1109\/INFOCOMWKSHPS51825.2021.9484510"},{"key":"e_1_3_2_1_50_1","volume-title":"2024 USENIX Annual Technical Conference (USENIX ATC 24)","author":"Wang Zheng","year":"2024","unstructured":"Zheng Wang, Yuke Wang, Boyuan Feng, Guyue Huang, Dheevatsa Mudigere, Bharath Muthiah, Ang Li, and Yufei Ding. 2024. {OPER}:{Optimality-Guided} Embedding Table Parallelization for Large-scale Recommendation Model. In 2024 USENIX Annual Technical Conference (USENIX ATC 24). 667--682."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3546765"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3434304"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3477495.3531775"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1145\/3448016.3457236"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2012.192"},{"key":"e_1_3_2_1_56_1","first-page":"255","article-title":"Equality saturation for tensor graph superoptimization","volume":"3","author":"Yang Yichen","year":"2021","unstructured":"Yichen Yang, Phitchaya Phothilimthana, Yisu Wang, Max Willsey, Sudip Roy, and Jacques Pienaar. 2021. Equality saturation for tensor graph superoptimization. Proceedings of Machine Learning and Systems 3 (2021), 255--268.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_57_1","first-page":"15190","article-title":"Dreamshard: Generalizable embedding table placement for recommender systems","volume":"35","author":"Zha Daochen","year":"2022","unstructured":"Daochen Zha, Louis Feng, Qiaoyu Tan, Zirui Liu, Kwei-Herng Lai, Bhargav Bhushanam, Yuandong Tian, Arun Kejariwal, and Xia Hu. 2022. Dreamshard: Generalizable embedding table placement for recommender systems. Advances in Neural Information Processing Systems 35 (2022), 15190--15203.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_58_1","volume-title":"EdgeNN: Efficient Neural Network Inference for CPU-GPU Integrated Edge Devices. In 2023 IEEE 39th International Conference on Data Engineering (ICDE). IEEE, 1193--1207","author":"Zhang Chenyang","year":"2023","unstructured":"Chenyang Zhang, Feng Zhang, Kuangyu Chen, Mingjun Chen, Bingsheng He, and Xiaoyong Du. 2023. EdgeNN: Efficient Neural Network Inference for CPU-GPU Integrated Edge Devices. In 2023 IEEE 39th International Conference on Data Engineering (ICDE). IEEE, 1193--1207."},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM51629.2021.00101"},{"key":"e_1_3_2_1_60_1","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zheng Lianmin","year":"2022","unstructured":"Lianmin Zheng, Zhuohan Li, Hao Zhang, Yonghao Zhuang, Zhifeng Chen, Yanping Huang, Yida Wang, Yuanzhong Xu, Danyang Zhuo, Eric P Xing, et al. 2022. Alpa: Automating inter-and {Intra-Operator} parallelism for distributed deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). 559--578."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33015941"},{"key":"e_1_3_2_1_62_1","volume-title":"Gdp: Generalized device placement for dataflow graphs. arXiv preprint arXiv:1910.01578","author":"Zhou Yanqi","year":"2019","unstructured":"Yanqi Zhou, Sudip Roy, Amirali Abdolrashidi, Daniel Wong, Peter C Ma, Qiumin Xu, Ming Zhong, Hanxiao Liu, Anna Goldie, Azalia Mirhoseini, et al. 2019. Gdp: Generalized device placement for dataflow graphs. arXiv preprint arXiv:1910.01578 (2019)."},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437984.3458838"},{"key":"e_1_3_2_1_64_1","volume-title":"18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24)","author":"Zhuang Donglin","year":"2024","unstructured":"Donglin Zhuang, Zhen Zheng, Haojun Xia, Xiafei Qiu, Junjie Bai, Wei Lin, and Shuaiwen Leon Song. 2024. {MonoNN}: Enabling a New Monolithic Optimization Space for Neural Network Inference Tasks on Modern {GPU-Centric} Architectures. In 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24). 989--1005."}],"event":{"name":"SPAA '25: 37th ACM Symposium on Parallelism in Algorithms and Architectures","sponsor":["SIGACT ACM Special Interest Group on Algorithms and Computation Theory","SIGARCH ACM Special Interest Group on Computer Architecture","EATCS European Association for Theoretical Computer Science"],"location":"Portland OR USA","acronym":"SPAA '25"},"container-title":["Proceedings of the 37th ACM Symposium on Parallelism in Algorithms and Architectures"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3694906.3743310","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:17:10Z","timestamp":1755875830000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3694906.3743310"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,7,16]]},"references-count":64,"alternative-id":["10.1145\/3694906.3743310","10.1145\/3694906"],"URL":"https:\/\/doi.org\/10.1145\/3694906.3743310","relation":{},"subject":[],"published":{"date-parts":[[2025,7,16]]},"assertion":[{"value":"2025-07-16","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}