{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,10]],"date-time":"2025-09-10T22:23:15Z","timestamp":1757542995073,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":51,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,8]],"date-time":"2024-10-08T00:00:00Z","timestamp":1728345600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,8]]},"DOI":"10.1145\/3640457.3688111","type":"proceedings-article","created":{"date-parts":[[2024,10,8]],"date-time":"2024-10-08T15:39:28Z","timestamp":1728401968000},"page":"622-632","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":2,"title":["Embedding Optimization for Training Large-scale Deep Learning Recommendation Systems with EMBark"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0007-5431-5787","authenticated-orcid":false,"given":"Shijie","family":"Liu","sequence":"first","affiliation":[{"name":"Hardware, NVIDIA Corporation, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3261-2135","authenticated-orcid":false,"given":"Nan","family":"Zheng","sequence":"additional","affiliation":[{"name":"Hardware, NVIDIA Corporation, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6797-6422","authenticated-orcid":false,"given":"Hui","family":"Kang","sequence":"additional","affiliation":[{"name":"Hardware, NVIDIA Corporation, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-4356-7054","authenticated-orcid":false,"given":"Xavier","family":"Simmons","sequence":"additional","affiliation":[{"name":"Hardware, NVIDIA Corporation, USA"}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-5065-6986","authenticated-orcid":false,"given":"Junjie","family":"Zhang","sequence":"additional","affiliation":[{"name":"Hardware, NVIDIA Corporation, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1776-8000","authenticated-orcid":false,"given":"Matthias","family":"Langer","sequence":"additional","affiliation":[{"name":"Hardware, NVIDIA Corporation, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-9004-1649","authenticated-orcid":false,"given":"Wenjing","family":"Zhu","sequence":"additional","affiliation":[{"name":"Hardware, NVIDIA Corporation, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8367-1939","authenticated-orcid":false,"given":"Minseok","family":"Lee","sequence":"additional","affiliation":[{"name":"Hardware, NVIDIA Corporation, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1072-2651","authenticated-orcid":false,"given":"Zehuan","family":"Wang","sequence":"additional","affiliation":[{"name":"Hardware, NVIDIA Corporation, China"}]}],"member":"320","published-online":{"date-parts":[[2024,10,8]]},"reference":[{"key":"e_1_3_2_2_1_1","doi-asserted-by":"publisher","DOI":"10.1145\/3088525.3088527"},{"key":"e_1_3_2_2_2_1","doi-asserted-by":"publisher","DOI":"10.14778\/3485450.3485462"},{"key":"e_1_3_2_2_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00080"},{"key":"e_1_3_2_2_4_1","volume-title":"Article 65","author":"Ben-Nun Tal","year":"2019","unstructured":"Tal Ben-Nun and Torsten Hoefler. 2019. Demystifying Parallel and Distributed Deep Learning: An In-Depth Concurrency Analysis. Comput. Surveys 52, 4, Article 65 (2019), 43\u00a0pages."},{"key":"e_1_3_2_2_5_1","doi-asserted-by":"publisher","DOI":"10.1145\/2988450.2988454"},{"key":"e_1_3_2_2_6_1","volume-title":"McKinsey Global Institute","author":"Chui Michael","year":"2018","unstructured":"Michael Chui, James Manyika, Mehdi Miremadi, Nicolaus Henke, Rita Chung, Pieter Nel, and Sankalp Malhotra. McKinsey Global Institute, 2018. Notes from the AI frontier: Insights from hundreds of use cases. https:\/\/www.mckinsey.com\/featured-insights\/artificial-intelligence\/notes-from-the-ai-frontier-applications-and-value-of-deep-learning. Accessed: 2024-04-20."},{"key":"e_1_3_2_2_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959190"},{"key":"e_1_3_2_2_8_1","doi-asserted-by":"publisher","DOI":"10.5555\/1953048.2021068"},{"key":"e_1_3_2_2_9_1","volume-title":"19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22)","author":"Eisenman Assaf","year":"2022","unstructured":"Assaf Eisenman, Kiran\u00a0Kumar Matam, Steven Ingram, Dheevatsa Mudigere, Raghuraman Krishnamoorthi, Krishnakumar Nair, Misha Smelyanskiy, and Murali Annavaram. 2022. Check-N-Run: a Checkpointing System for Training Deep Learning Recommendation Models. In 19th USENIX Symposium on Networked Systems Design and Implementation (NSDI 22). USENIX Association, Renton, WA, USA, 929\u2013943."},{"key":"e_1_3_2_2_10_1","doi-asserted-by":"publisher","DOI":"10.1109\/NoCS.2013.6558404"},{"key":"e_1_3_2_2_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/2843948"},{"key":"e_1_3_2_2_12_1","volume-title":"The Architectural Implications of Facebook\u2019s DNN-Based Personalized Recommendation. In 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA)","author":"Gupta Udit","year":"2020","unstructured":"Udit Gupta, Carole-Jean Wu, Xiaodong Wang, Maxim Naumov, Brandon Reagen, David Brooks, Bradford Cottel, Kim Hazelwood, Mark Hempstead, Bill Jia, Hsien-Hsin\u00a0S. Lee, Andrey Malevich, Dheevatsa Mudigere, Mikhail Smelyanskiy, Liang Xiong, and Xuan Zhang. 2020. The Architectural Implications of Facebook\u2019s DNN-Based Personalized Recommendation. In 2020 IEEE International Symposium on High Performance Computer Architecture (HPCA) (San Diego, CA, USA). IEEE Press, New York, NY, USA, 488\u2013501."},{"key":"e_1_3_2_2_13_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41586-020-2649-2"},{"key":"e_1_3_2_2_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3547387"},{"key":"e_1_3_2_2_15_1","volume-title":"SPACE: Locality-Aware Processing in Heterogeneous Memory for Personalized Recommendations. In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA)","author":"Kal Hongju","year":"2021","unstructured":"Hongju Kal, Seokmin Lee, Gun Ko, and Won\u00a0Woo Ro. 2021. SPACE: Locality-Aware Processing in Heterogeneous Memory for Personalized Recommendations. In 2021 ACM\/IEEE 48th Annual International Symposium on Computer Architecture (ISCA) (Valencia, Spain). IEEE Press, New York, NY, USA, 679\u2013691."},{"key":"e_1_3_2_2_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00070"},{"key":"e_1_3_2_2_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_2_2_18_1","volume-title":"AdaEmbed: Adaptive Embedding for Large-Scale Recommendation Models. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Lai Fan","year":"2023","unstructured":"Fan Lai, Wei Zhang, Rui Liu, William Tsai, Xiaohan Wei, Yuxi Hu, Sabin Devkota, Jianyu Huang, Jongsoo Park, Xing Liu, Zeliang Chen, Ellie Wen, Paul Rivera, Jie You, Chun cheng Jason\u00a0Chen, and Mosharaf Chowdhury. 2023. AdaEmbed: Adaptive Embedding for Large-Scale Recommendation Models. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 817\u2013831."},{"key":"e_1_3_2_2_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2020.3003307"},{"key":"e_1_3_2_2_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446717"},{"key":"e_1_3_2_2_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2019.2928289"},{"key":"e_1_3_2_2_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589310"},{"key":"e_1_3_2_2_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2020.2974843"},{"key":"e_1_3_2_2_24_1","unstructured":"Meta Corporation. 2024. FBGEMM Training Embedding Backward Split Kernel. https:\/\/github.com\/pytorch\/FBGEMM\/tree\/main\/fbgemm_gpu\/codegen\/training\/backward\/embedding_backward_split_kernel_warp_template.cu. Accessed: 2024-04-20."},{"key":"e_1_3_2_2_25_1","doi-asserted-by":"publisher","DOI":"10.14778\/3489496.3489511"},{"key":"e_1_3_2_2_26_1","unstructured":"MLCommons Open ML\/AI Engineering Consortium. 2024. MLPerf Benchmarks. https:\/\/mlcommons.org\/benchmarks\/training. Accessed: 2024-04-20."},{"key":"e_1_3_2_2_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2018.2880197"},{"key":"e_1_3_2_2_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533727"},{"key":"e_1_3_2_2_29_1","volume-title":"Deep Learning Recommendation Model for Personalization and Recommendation Systems. CoRR abs\/1906.00091","author":"Naumov Maxim","year":"2019","unstructured":"Maxim Naumov, Dheevatsa Mudigere, Hao-Jun\u00a0Michael Shi, Jianyu Huang, Narayanan Sundaraman, Jongsoo Park, Xiaodong Wang, Udit Gupta, Carole-Jean Wu, Alisson\u00a0G. Azzolini, Dmytro Dzhulgakov, Andrey Mallevich, Ilia Cherniavskii, Yinghai Lu, Raghuraman Krishnamoorthi, Ansha Yu, Volodymyr Kondratenko, Stephanie Pereira, Xianjie Chen, Wenlin Chen, Vijay Rao, Bill Jia, Liang Xiong, and Misha Smelyanskiy. 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. CoRR abs\/1906.00091 (2019), 10\u00a0pages."},{"key":"e_1_3_2_2_30_1","unstructured":"NVIDIA Corporation. 2024. CUDA C Programming Guide \/ Maximize Instruction Throughput. https:\/\/docs.nvidia.com\/cuda\/cuda-c-programming-guide\/index.html#maximize-instruction-throughput. Accessed: 2024-04-20."},{"key":"e_1_3_2_2_31_1","unstructured":"NVIDIA Corporation. 2024. NVIDIA Collective Communications Library (NCCL). https:\/\/developer.nvidia.com\/nccl. Accessed: 2024-04-20."},{"key":"e_1_3_2_2_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/2816815"},{"key":"e_1_3_2_2_33_1","doi-asserted-by":"publisher","DOI":"10.1145\/336992.337035"},{"key":"e_1_3_2_2_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507777"},{"key":"e_1_3_2_2_35_1","volume-title":"FlexShard: Flexible Sharding for Industry-Scale Sequence Recommendation Models. CoRR abs\/2301.02959","author":"Sethi Geet","year":"2023","unstructured":"Geet Sethi, Pallab Bhattacharya, Dhruv Choudhary, Carole-Jean Wu, and Christos Kozyrakis. 2023. FlexShard: Flexible Sharding for Industry-Scale Sequence Recommendation Models. CoRR abs\/2301.02959 (2023), 14\u00a0pages."},{"key":"e_1_3_2_2_36_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3403059"},{"key":"e_1_3_2_2_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIC.2017.72"},{"key":"e_1_3_2_2_38_1","volume-title":"Emerj AI Sector Overviews","author":"Underwood Corinna","year":"2020","unstructured":"Corinna Underwood. Emerj AI Sector Overviews, 2020. Use Cases of Recommendation Systems in Business \u2013 Current Applications and Methods. https:\/\/emerj.com\/ai-sector-overviews\/use-cases-recommendation-systems. Accessed: 2024-04-20."},{"key":"e_1_3_2_2_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219869"},{"key":"e_1_3_2_2_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3442381.3450078"},{"key":"e_1_3_2_2_41_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3547405"},{"key":"e_1_3_2_2_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3523227.3546765"},{"key":"e_1_3_2_2_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446763"},{"key":"e_1_3_2_2_44_1","volume-title":"Microsoft Research Lab \u2013 Asia","author":"Xie Xing","year":"2018","unstructured":"Xing Xie, Jianxun Lian, Zheng Liu, Xiting Wang, Fangzhao Wu, Hongwei Wang, and Zhongxia Chen. Microsoft Research Lab \u2013 Asia, 2018. Personalized Recommendation Systems: Five Hot Research Topics You Must Know. https:\/\/www.microsoft.com\/en-us\/research\/lab\/microsoft-research-asia\/articles\/personalized-recommendation-systems. Accessed: 2024-04-20."},{"key":"e_1_3_2_2_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3582016.3582029"},{"key":"e_1_3_2_2_46_1","first-page":"448","article-title":"TT-Rec: Tensor Train Compression for Deep Learning Recommendation Models","volume":"3","author":"Yin Chunxing","year":"2021","unstructured":"Chunxing Yin, Bilge Acun, Carole-Jean Wu, and Xing Liu. 2021. TT-Rec: Tensor Train Compression for Deep Learning Recommendation Models. Proceedings of Machine Learning and Systems 3 (2021), 448\u2013462.","journal-title":"Proceedings of Machine Learning and Systems"},{"volume-title":"Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.). Vol.\u00a035. Curran Associates","author":"Zha Daochen","key":"e_1_3_2_2_47_1","unstructured":"Daochen Zha, Louis Feng, Qiaoyu Tan, Zirui Liu, Kwei-Herng Lai, Bhargav Bhushanam, Yuandong Tian, Arun Kejariwal, and Xia Hu. 2022. DreamShard: Generalizable Embedding Table Placement for Recommender Systems. In Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.). Vol.\u00a035. Curran Associates, Inc., New York, NY, USA, 15190\u201315203."},{"key":"e_1_3_2_2_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3383313.3412227"},{"key":"e_1_3_2_2_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3358045"},{"key":"e_1_3_2_2_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357384.3358045"},{"key":"e_1_3_2_2_51_1","volume-title":"Article 11378","author":"Zhou Hongde","year":"2023","unstructured":"Hongde Zhou, Fei Xiong, and Hongshu Chen. 2023. A Comprehensive Survey of Recommender Systems Based on Deep Learning. Applied Sciences 13, 20, Article 11378 (2023), 31\u00a0pages."}],"event":{"name":"RecSys '24: 18th ACM Conference on Recommender Systems","sponsor":["SIGWEB ACM Special Interest Group on Hypertext, Hypermedia, and Web","SIGAI ACM Special Interest Group on Artificial Intelligence","SIGKDD ACM Special Interest Group on Knowledge Discovery in Data","SIGIR ACM Special Interest Group on Information Retrieval","SIGCHI ACM Special Interest Group on Computer-Human Interaction"],"location":"Bari Italy","acronym":"RecSys '24"},"container-title":["18th ACM Conference on Recommender Systems"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640457.3688111","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3640457.3688111","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:58:32Z","timestamp":1750294712000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3640457.3688111"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,8]]},"references-count":51,"alternative-id":["10.1145\/3640457.3688111","10.1145\/3640457"],"URL":"https:\/\/doi.org\/10.1145\/3640457.3688111","relation":{},"subject":[],"published":{"date-parts":[[2024,10,8]]},"assertion":[{"value":"2024-10-08","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}