{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T14:43:29Z","timestamp":1777128209847,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":55,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"DOI":"10.13039\/100000001","name":"NSF (National Science Foundation)","doi-asserted-by":"publisher","award":["NSF 2124039"],"award-info":[{"award-number":["NSF 2124039"]}],"id":[{"id":"10.13039\/100000001","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Amazon Faculty Research Award 2021"},{"name":"DOE U.S. Office of Science","award":["66150"],"award-info":[{"award-number":["66150"]}]},{"name":"DOE U.S. Office of Advanced Scientific Computing Research","award":["66150"],"award-info":[{"award-number":["66150"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620665.3640406","type":"proceedings-article","created":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T14:18:06Z","timestamp":1713795486000},"page":"964-979","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["RAP: Resource-aware Automated GPU Sharing for Multi-GPU Recommendation Model Training and Input Preprocessing"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-8575-9432","authenticated-orcid":false,"given":"Zheng","family":"Wang","sequence":"first","affiliation":[{"name":"University of California, San Diego, San Diego, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5914-586X","authenticated-orcid":false,"given":"Yuke","family":"Wang","sequence":"additional","affiliation":[{"name":"University of California, Santa Barbara, Santa Barbara, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-6691-4289","authenticated-orcid":false,"given":"Jiaqi","family":"Deng","sequence":"additional","affiliation":[{"name":"University of California, Santa Barbara, Santa Barbara, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8115-5415","authenticated-orcid":false,"given":"Da","family":"Zheng","sequence":"additional","affiliation":[{"name":"Amazon, Santa Clara, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3734-9137","authenticated-orcid":false,"given":"Ang","family":"Li","sequence":"additional","affiliation":[{"name":"Pacific Northwest National Laboratory, Richland, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8716-5793","authenticated-orcid":false,"given":"Yufei","family":"Ding","sequence":"additional","affiliation":[{"name":"University of California, San Diego, San Diego, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Criteo display ad challenge. https:\/\/www.kaggle.com\/c\/criteodisplay-ad-challenge."},{"key":"e_1_3_2_1_2_1","unstructured":"Nvidia data loading library (dali). https:\/\/developer.nvidia.com\/dali."},{"key":"e_1_3_2_1_3_1","unstructured":"Nvidia merlin hugectr. https:\/\/developer.nvidia.com\/nvidia-merlin\/hugectr."},{"key":"e_1_3_2_1_4_1","unstructured":"Terabyte click logs. https:\/\/labs.criteo.com\/2013\/12\/downloadterabyte-click-logs."},{"key":"e_1_3_2_1_5_1","volume-title":"github.com\/pytorch\/torchrec\/","year":"2022","unstructured":"Torchrec. github.com\/pytorch\/torchrec\/, 2022."},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00072"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.14778\/3485450.3485462"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.14778\/3485450.3485462"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2939672.2939785"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2988450.2988454"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1145\/3337821.3337892"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/n19-1423"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2843948"},{"key":"e_1_3_2_1_14_1","first-page":"689","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Graur Dan","year":"2022","unstructured":"Dan Graur, Damien Aymon, Dan Kluser, Tanguy Albrici, Chandramohan A Thekkath, and Ana Klimovic. Cachew: Machine learning input data processing as a service. In 2022 USENIX Annual Technical Conference (USENIX ATC 22), pages 689--706, 2022. https:\/\/www.usenix.org\/conference\/atc22\/presentation\/graur."},{"key":"e_1_3_2_1_15_1","first-page":"485","volume-title":"NSDI","volume":"19","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu, Mosharaf Chowdhury, Kang G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Harry Liu, and Chuanxiong Guo. Tiresias: A gpu cluster manager for distributed deep learning. In NSDI, volume 19, pages 485--500, 2019. https:\/\/www.usenix.org\/conference\/nsdi19\/presentation\/gu."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_2_1_17_1","volume-title":"Gurobi optimizer reference manual","author":"Gurobi Optimization LLC","year":"2021","unstructured":"LLC Gurobi Optimization. Gurobi optimizer reference manual, 2021. https:\/\/www.gurobi.com\/documentation\/current\/refman\/index.html."},{"key":"e_1_3_2_1_18_1","first-page":"539","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022","author":"Han Mingcong","year":"2022","unstructured":"Mingcong Han, Hanze Zhang, Rong Chen, and Haibo Chen. Microsecond-scale preemption for concurrent gpu-accelerated DNN inferences. In Marcos K. Aguilera and Hakim Weatherspoon, editors, 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022, pages 539--558. USENIX Association, 2022. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/han."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1145\/3038912.3052569"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3514221.3517848"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.14778\/3425879.3425881"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-01970-8_89"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.14778\/3446095.3446100"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533727"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.14778\/3476311.3476374"},{"key":"e_1_3_2_1_28_1","unstructured":"Maxim Naumov Dheevatsa Mudigere Hao-Jun Michael Shi Jianyu Huang Narayanan Sundaraman Jongsoo Park Xiaodong Wang Udit Gupta Carole-Jean Wu Alisson G. Azzolini Dmytro Dzhulgakov Andrey Mallevich Ilia Cherniavskii Yinghai Lu Raghuraman Krishnamoorthi Ansha Yu Volodymyr Kondratenko Stephanie Pereira Xianjie Chen Wenlin Chen Vijay Rao Bill Jia Liang Xiong and Misha Smelyanskiy. Deep learning recommendation model for personalization and recommendation systems. CoRR abs\/1906.00091 2019. http:\/\/arxiv.org\/abs\/1906.00091."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437359.3465586"},{"key":"e_1_3_2_1_30_1","unstructured":"Nvidia. Nvidia dgx a100. www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/nvidia-dgx-a100-datasheet.pdf."},{"key":"e_1_3_2_1_31_1","unstructured":"NVIDIA. Nvidia multi-process service. docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf."},{"key":"e_1_3_2_1_32_1","volume-title":"Cuda c\/c++ streams and concurrency. \"http:\/\/on-demand.gputechconf.com\/gtcexpress\/2011\/presentations\/StreamsAndConcurrencyWebinar.pdf","year":"2011","unstructured":"Nvidia. Cuda c\/c++ streams and concurrency. \"http:\/\/on-demand.gputechconf.com\/gtcexpress\/2011\/presentations\/StreamsAndConcurrencyWebinar.pdf\", 2011."},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00072"},{"key":"e_1_3_2_1_34_1","unstructured":"Apache Parquet. Apache parquet. https:\/\/parquet.apache.org\/."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.14778\/3554821.3554829"},{"key":"e_1_3_2_1_36_1","unstructured":"Pytorch. Torcharrow. https:\/\/pytorch.org\/torcharrow\/beta\/index.html."},{"issue":"8","key":"e_1_3_2_1_37_1","first-page":"9","article-title":"Language models are un-supervised multitask learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever. Language models are un-supervised multitask learners. OpenAI blog, 1(8):9, 2019. https:\/\/cdn.openai.com\/better-language-models\/language_models_are_unsupervised_multitask_learners.pdf.","journal-title":"OpenAI blog"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507777"},{"key":"e_1_3_2_1_40_1","first-page":"821","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022","author":"Sima Chijun","year":"2022","unstructured":"Chijun Sima, Yao Fu, Man-Kit Sit, Liyi Guo, Xuri Gong, Feng Lin, Junyu Wu, Yongsheng Li, Haidong Rong, Pierre-Louis Aublin, and Luo Mai. Ekko: A large-scale deep learning recommender system with low-latency model update. In Marcos K. Aguilera and Hakim Weatherspoon, editors, 16th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2022, Carlsbad, CA, USA, July 11-13, 2022, pages 821--839. USENIX Association, 2022. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/sima."},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/MIC.2017.72"},{"key":"e_1_3_2_1_42_1","unstructured":"TensorFlow. Module: tf.data.experimental.service. https:\/\/www.tensorflow.org\/api_docs\/python\/tf\/data\/experimental\/service."},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.14778\/3579075.3579083"},{"key":"e_1_3_2_1_44_1","volume-title":"Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need. In Isabelle Guyon, Ulrike von Luxburg, Samy Bengio, Hanna M. Wallach, Rob Fergus, S. V. N. Vishwanathan, and Roman Garnett, editors, Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, pages 5998--6008, 2017. https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1137\/130915303"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/GreenCom-CPSCom.2010.102"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3465401"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00075"},{"key":"e_1_3_2_1_49_1","first-page":"98","article-title":"Fine-grained gpu sharing primitives for deep learning applications","volume":"2","author":"Yu Peifeng","year":"2020","unstructured":"Peifeng Yu and Mosharaf Chowdhury. Fine-grained gpu sharing primitives for deep learning applications. Proceedings of Machine Learning and Systems, 2:98--111, 2020. https:\/\/proceedings.mlsys.org\/book\/294.pdf.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3534678.3539034"},{"key":"e_1_3_2_1_51_1","volume-title":"NeurIPS, 2022","author":"Zha Daochen","year":"2022","unstructured":"Daochen Zha, Louis Feng, Qiaoyu Tan, Zirui Liu, Kwei-Herng Lai, Bhargav Bhushanam, Yuandong Tian, Arun Kejariwal, and Xia Hu. Dreamshard: Generalizable embedding table placement for recommender systems. In NeurIPS, 2022. http:\/\/papers.nips.cc\/paper_files\/paper\/2022\/hash\/62302a24b04589f9f9cdd5b02c344b6c-Abstract-Conference.html."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589773"},{"key":"e_1_3_2_1_53_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3533044"},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2211.05239"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.2303.13803"}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640406","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/abs\/10.1145\/3620665.3640406","content-type":"text\/html","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640406","content-type":"application\/pdf","content-version":"vor","intended-application":"syndication"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640406","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:42Z","timestamp":1750291422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640406"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":55,"alternative-id":["10.1145\/3620665.3640406","10.1145\/3620665"],"URL":"https:\/\/doi.org\/10.1145\/3620665.3640406","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}