{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,16]],"date-time":"2026-06-16T15:05:05Z","timestamp":1781622305213,"version":"3.54.5"},"publisher-location":"New York, NY, USA","reference-count":42,"publisher":"ACM","license":[{"start":{"date-parts":[[2022,9,13]],"date-time":"2022-09-13T00:00:00Z","timestamp":1663027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2022,9,18]]},"DOI":"10.1145\/3523227.3546765","type":"proceedings-article","created":{"date-parts":[[2022,9,13]],"date-time":"2022-09-13T14:13:46Z","timestamp":1663078426000},"page":"408-419","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":19,"title":["A GPU-specialized Inference Parameter Server for Large-Scale Deep Recommendation Models"],"prefix":"10.1145","author":[{"given":"Yingcan","family":"Wei","sequence":"first","affiliation":[{"name":"NVIDIA, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Matthias","family":"Langer","sequence":"additional","affiliation":[{"name":"NVIDIA, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Fan","family":"Yu","sequence":"additional","affiliation":[{"name":"NVIDIA, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Minseok","family":"Lee","sequence":"additional","affiliation":[{"name":"NVIDIA, Korea, Republic of"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Jie","family":"Liu","sequence":"additional","affiliation":[{"name":"NVIDIA, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ji","family":"Shi","sequence":"additional","affiliation":[{"name":"NVIDIA, China"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zehuan","family":"Wang","sequence":"additional","affiliation":[{"name":"NVIDIA, China"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"320","published-online":{"date-parts":[[2022,9,13]]},"reference":[{"key":"e_1_3_2_2_1_1","volume-title":"Beyond Databases, Architectures and Structures. Facing the Challenges of Data Proliferation and Growing Variety","author":"Arefyeva Iya","unstructured":"Iya Arefyeva, David Broneske, Gabriel Campero, Marcus Pinnecke, and Gunter Saake. 2018. Memory Management Strategies in CPU\/GPU Database Systems: A Survey. In Beyond Databases, Architectures and Structures. Facing the Challenges of Data Proliferation and Growing Variety. Springer International Publishing, Cham, 128\u2013142."},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/3320060"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1137\/070710111"},{"key":"e_1_3_2_2_4_1","unstructured":"Yann Collet. 2014. xxHash Hash Function. https:\/\/www.xxhash.com. Accessed: 2022-04-15."},{"key":"e_1_3_2_2_5_1","volume-title":"Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17)","author":"Crankshaw Daniel","year":"2017","unstructured":"Daniel Crankshaw, Xin Wang, Guilio Zhou, Michael\u00a0J. Franklin, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2017. Clipper: A Low-Latency Online Prediction Serving System. In 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17). USENIX Association, Boston, MA, 613\u2013627."},{"key":"e_1_3_2_2_6_1","unstructured":"Criteo AI Lab. 2014. Criteo 1TB Click Logs dataset. https:\/\/www.kaggle.com\/c\/criteo-display-ad-challenge. Accessed: 2022-03-15."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2901318.2901323"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.1109\/MLHPC54614.2021.00009"},{"key":"e_1_3_2_2_9_1","unstructured":"David Goodwin 2021. NVIDIA Triton: Model Control Mode. https:\/\/github.com\/triton-inference-server\/server\/blob\/main\/docs\/model_management.md#model-control-mode-explicit. Accessed: 2022-07-19."},{"key":"e_1_3_2_2_10_1","unstructured":"David Goodwin 2021. Triton Architecture - Concurrent Model Execution. https:\/\/github.com\/triton-inference-server\/server\/blob\/main\/docs\/architecture.md#concurrent-model-execution. Accessed: 2022-07-19."},{"key":"e_1_3_2_2_11_1","unstructured":"David Goodwin 2022. NVIDIA Triton: Performance Analyzer. https:\/\/github.com\/triton-inference-server\/server\/blob\/main\/docs\/perf_analyzer.md. Accessed: 2022-07-19."},{"key":"e_1_3_2_2_12_1","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3462976"},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00084"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467080"},{"key":"e_1_3_2_2_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2827872"},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1145\/3177732.3177734"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1093\/nsr"},{"key":"e_1_3_2_2_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3447548.3467139"},{"key":"e_1_3_2_2_19_1","unstructured":"David Kanter Peter Mattson 2021. ML \u00b7 Commons \/ MLperf v1.1 Results. https:\/\/mlcommons.org\/en\/training-normal-11. Accessed: 2022-03-15."},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3003307"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS51385.2021.00033"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267809.3267840"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/2788396"},{"key":"e_1_3_2_2_24_1","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun\u00a0Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson\u00a0G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. CoRR abs\/1906.00091(2019) 10\u00a0pages. http:\/\/arxiv.org\/abs\/1906.00091"},{"key":"e_1_3_2_2_25_1","unstructured":"NVIDA. 2022. HugeCTR: Distributed deployment with the Hierarchical Parameter Server. https:\/\/github.com\/triton-inference-server\/hugectr_backend\/blob\/main\/docs\/architecture.md#distributed-deployment-with-hierarchical-hugectr-parameter-server. Accessed: 2022-04-15."},{"key":"e_1_3_2_2_26_1","unstructured":"NVIDA. 2022. HugeCTR Hierarchical Parameter Server. https:\/\/github.com\/triton-inference-server\/hugectr_backend\/blob\/main\/docs\/hierarchical_parameter_server.md#hugectr-hierarchical-parameter-server. Accessed: 2022-04-15."},{"key":"e_1_3_2_2_27_1","unstructured":"NVIDIA. 2020. NVIDIA A100 Tensor Core GPU. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf. 82\u00a0pages."},{"key":"e_1_3_2_2_28_1","unstructured":"NVIDIA. 2022. CUDA C Programming Guide. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html#simt-architecture. 131\u00a0pages. Accessed: 2022-04-15."},{"key":"e_1_3_2_2_29_1","unstructured":"NVIDIA. 2022. HugeCTR: Embedding Cache Asynchronous Insertion Mechanism. https:\/\/github.com\/triton-inference-server\/hugectr_backend#embedding-cache-asynchronous-insertion-mechanism. Accessed: 2022-04-15."},{"key":"e_1_3_2_2_30_1","unstructured":"NVIDIA. 2022. HugeCTR: GPU Cache. https:\/\/github.com\/NVIDIA-Merlin\/HugeCTR\/tree\/master\/gpu_cache. Accessed: 2022-04-15."},{"key":"e_1_3_2_2_31_1","unstructured":"NVIDIA. 2022. HugeCTR: Triton Backend. https:\/\/github.com\/triton-inference-server\/hugectr_backend. Accessed: 2022-05-15."},{"key":"e_1_3_2_2_32_1","unstructured":"NVidia Corp.2022. Solutions for the Data-Center. https:\/\/www.nvidia.com\/en-us\/data-center. Accessed: 2022-04-15."},{"key":"e_1_3_2_2_33_1","unstructured":"NVIDIA Deep Learning Examples for Tensor Cores. 2022. Deploying the DLRM model using Triton Inference Server. https:\/\/github.com\/NVIDIA\/DeepLearningExamples\/tree\/master\/PyTorch\/Recommendation\/DLRM\/triton#performance."},{"key":"e_1_3_2_2_34_1","unstructured":"NVIDIA Deep Learning Examples for Tensor Cores. 2022. DLRM For TensorFlow 2. https:\/\/github.com\/NVIDIA\/DeepLearningExamples\/tree\/master\/TensorFlow2\/Recommendation\/DLRM#inference-performance-results."},{"key":"e_1_3_2_2_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3485126"},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","unstructured":"Matthias\u00a0J. Sax. 2018. Apache Kafka. Springer International Publishing Cham 1\u20138. https:\/\/doi.org\/10.1007\/978-3-319-63962-8_196-1","DOI":"10.1007\/978-3-319-63962-8_196-1"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDEW53142.2021.00014"},{"key":"e_1_3_2_2_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3357895"},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446763"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.14778\/2536206.2536210"},{"key":"e_1_3_2_2_42_1","first-page":"412","article-title":"Distributed Hierarchical GPU Parameter Server for Massive Scale Deep Learning Ads Systems","volume":"2","author":"Zhao Weijie","year":"2020","unstructured":"Weijie Zhao, Deping Xie, Ronglai Jia, Yulei Qian, Ruiquan Ding, Mingming Sun, and Ping Li. 2020. Distributed Hierarchical GPU Parameter Server for Massive Scale Deep Learning Ads Systems. Proceedings of Machine Learning and Systems 2 (2020), 412\u2013428.","journal-title":"Proceedings of Machine Learning and Systems"}],"event":{"name":"RecSys '22: Sixteenth ACM Conference on Recommender Systems","location":"Seattle WA USA","acronym":"RecSys '22","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGAI ACM Special Interest Group on Artificial Intelligence","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGIR ACM Special Interest Group on Information Retrieval","SIGCHI ACM Special Interest Group on Computer-Human Interaction"]},"container-title":["Proceedings of the 16th ACM Conference on Recommender Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3523227.3546765","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3523227.3546765","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T19:30:45Z","timestamp":1750188645000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3523227.3546765"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2022,9,13]]},"references-count":42,"alternative-id":["10.1145\/3523227.3546765","10.1145\/3523227"],"URL":"https:\/\/doi.org\/10.1145\/3523227.3546765","relation":{},"subject":[],"published":{"date-parts":[[2022,9,13]]},"assertion":[{"value":"2022-09-13","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}