{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T17:24:34Z","timestamp":1768411474049,"version":"3.49.0"},"publisher-location":"New York, NY, USA","reference-count":99,"publisher":"ACM","license":[{"start":{"date-parts":[[2023,10,30]],"date-time":"2023-10-30T00:00:00Z","timestamp":1698624000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2023,10,30]]},"DOI":"10.1145\/3620678.3624661","type":"proceedings-article","created":{"date-parts":[[2023,10,31]],"date-time":"2023-10-31T13:58:07Z","timestamp":1698760687000},"page":"281-297","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":5,"title":["Towards GPU Memory Efficiency for Distributed Training at Scale"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-5058-9405","authenticated-orcid":false,"given":"Runxiang","family":"Cheng","sequence":"first","affiliation":[{"name":"University of Illinois Urbana-Champaign"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3416-8939","authenticated-orcid":false,"given":"Chris","family":"Cai","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc."}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-6071-5085","authenticated-orcid":false,"given":"Selman","family":"Yilmaz","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc."}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-3506-7612","authenticated-orcid":false,"given":"Rahul","family":"Mitra","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc."}]},{"ORCID":"https:\/\/orcid.org\/0009-0001-3272-368X","authenticated-orcid":false,"given":"Malay","family":"Bag","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc."}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8515-0494","authenticated-orcid":false,"given":"Mrinmoy","family":"Ghosh","sequence":"additional","affiliation":[{"name":"Meta Platforms, Inc."}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4443-8170","authenticated-orcid":false,"given":"Tianyin","family":"Xu","sequence":"additional","affiliation":[{"name":"University of Illinois Urbana-Champaign"}]}],"member":"320","published-online":{"date-parts":[[2023,10,31]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00072"},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/CCGrid54584.2022.00079"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1145\/3492321.3519584"},{"key":"e_1_3_2_1_4_1","volume-title":"On the opportunities and risks of foundation models. arXiv:2108.07258","author":"Bommasani R.","year":"2021","unstructured":"Bommasani, R., Hudson, D. A., Adeli, E., Altman, R., Arora, S., von Arx, S., Bernstein, M. S., Bohg, J., Bosselut, A., Brunskill, E., et al. On the opportunities and risks of foundation models. arXiv:2108.07258 (2021)."},{"key":"e_1_3_2_1_5_1","volume-title":"Language models are few-shot learners. NeurIPS","author":"Brown T.","year":"2020","unstructured":"Brown, T., Mann, B., Ryder, N., Subbiah, M., Kaplan, J. D., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., et al. Language models are few-shot learners. NeurIPS (2020)."},{"key":"e_1_3_2_1_6_1","volume-title":"Deep learning research and development platform: Characterizing and scheduling with qos guarantees on gpu clusters. TPDS","author":"Chen Z.","year":"2019","unstructured":"Chen, Z., Quan, W., Wen, M., Fang, J., Yu, J., Zhang, C., and Luo, L. Deep learning research and development platform: Characterizing and scheduling with qos guarantees on gpu clusters. TPDS (2019)."},{"key":"e_1_3_2_1_7_1","volume-title":"DLRS","author":"Cheng H.-T.","year":"2016","unstructured":"Cheng, H.-T., Koc, L., Harmsen, J., Shaked, T., Chandra, T., Aradhye, H., Anderson, G., Corrado, G., Chai, W., Ispir, M., et al. Wide & deep learning for recommender systems. In DLRS (2016)."},{"key":"e_1_3_2_1_8_1","volume-title":"Palm: Scaling language modeling with pathways. arXiv:2204.02311","author":"Chowdhery A.","year":"2022","unstructured":"Chowdhery, A., Narang, S., Devlin, J., Bosma, M., Mishra, G., Roberts, A., Barham, P., Chung, H. W., Sutton, C., Gehrmann, S., et al. Palm: Scaling language modeling with pathways. arXiv:2204.02311 (2022)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/3132747.3132772"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959190"},{"key":"e_1_3_2_1_11_1","volume-title":"Nvidia became a $1 trillion company thanks to the AI boom. https:\/\/www.theverge.com\/2023\/5\/30\/23742123\/nvidia-stock-ai-gpu-1-trillion-market-cap-price-value","author":"Davis W.","year":"2023","unstructured":"Davis, W., and Lawler, R. Nvidia became a $1 trillion company thanks to the AI boom. https:\/\/www.theverge.com\/2023\/5\/30\/23742123\/nvidia-stock-ai-gpu-1-trillion-market-cap-price-value, 2023."},{"key":"e_1_3_2_1_12_1","volume-title":"Large scale distributed deep networks. NeurIPS","author":"Dean J.","year":"2012","unstructured":"Dean, J., Corrado, G., Monga, R., Chen, K., Devin, M., Mao, M., Ranzato, M., Senior, A., Tucker, P., Yang, K., et al. Large scale distributed deep networks. NeurIPS (2012)."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541941"},{"key":"e_1_3_2_1_14_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805","author":"Devlin J.","year":"2018","unstructured":"Devlin, J., Chang, M.-W., Lee, K., and Toutanova, K. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/2736277.2741667"},{"key":"e_1_3_2_1_16_1","volume-title":"Parallel training of pre-trained models via chunk-based dynamic memory management. TPDS","author":"Fang J.","year":"2022","unstructured":"Fang, J., Zhu, Z., Li, S., Su, H., Yu, Y., Zhou, J., and You, Y. Parallel training of pre-trained models via chunk-based dynamic memory management. TPDS (2022)."},{"key":"e_1_3_2_1_17_1","volume-title":"Deep learning workload scheduling in gpu datacenters: Taxonomy, challenges and vision. arXiv:2205.11913","author":"Gao W.","year":"2022","unstructured":"Gao, W., Hu, Q., Ye, Z., Sun, P., Wang, X., Luo, Y., Zhang, T., and Wen, Y. Deep learning workload scheduling in gpu datacenters: Taxonomy, challenges and vision. arXiv:2205.11913 (2022)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3486978"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1145\/3368089.3417050"},{"key":"e_1_3_2_1_20_1","volume-title":"NSDI","author":"Gu J.","year":"2019","unstructured":"Gu, J., Chowdhury, M., Shin, K. G., Zhu, Y., Jeon, M., Qian, J., Liu, H. H., and Guo, C. Tiresias: A gpu cluster manager for distributed deep learning. In NSDI (2019)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_2_1_22_1","volume-title":"OpenAI's plans according to Sam Altman. https:\/\/website-754fwhahs-humanloopml.vercel.app\/blog\/open_ai_talk?utm_source=bensbites&utm_medium=newsletter&utm_campaign=openai-s-roadmap\/","author":"Habib R.","year":"2023","unstructured":"Habib, R. OpenAI's plans according to Sam Altman. https:\/\/website-754fwhahs-humanloopml.vercel.app\/blog\/open_ai_talk?utm_source=bensbites&utm_medium=newsletter&utm_campaign=openai-s-roadmap\/, 2023."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1109\/IISWC50251.2020.00024"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476223"},{"key":"e_1_3_2_1_26_1","volume-title":"Language is not all you need: Aligning perception with language models. arXiv:2302.14045","author":"Huang S.","year":"2023","unstructured":"Huang, S., Dong, L., Wang, W., Hao, Y., Singhal, S., Ma, S., Lv, T., Cui, L., Mohammed, O. K., Liu, Q., et al. Language is not all you need: Aligning perception with language models. arXiv:2302.14045 (2023)."},{"key":"e_1_3_2_1_27_1","volume-title":"Gpipe: Efficient training of giant neural networks using pipeline parallelism. NeurIPS","author":"Huang Y.","year":"2019","unstructured":"Huang, Y., Cheng, Y., Bapna, A., Firat, O., Chen, D., Chen, M., Lee, H., Ngiam, J., Le, Q. V., Wu, Y., et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. NeurIPS (2019)."},{"key":"e_1_3_2_1_28_1","volume-title":"NSDI","author":"Hwang C.","year":"2021","unstructured":"Hwang, C., Kim, T., Kim, S., Shin, J., and Park, K. Elastic resource sharing for distributed deep learning. In NSDI (2021)."},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3338906.3338955"},{"key":"e_1_3_2_1_30_1","volume-title":"ATC","author":"Jeon M.","year":"2019","unstructured":"Jeon, M., Venkataraman, S., Phanishatee, A., Qian, J., Xiao, W., and Yang, F. Analysis of large-scale multi-tenant gpu clusters for dnn training workloads. In ATC (2019)."},{"key":"e_1_3_2_1_31_1","volume-title":"ATC","author":"Jia X.","year":"2022","unstructured":"Jia, X., Jiang, L., Wang, A., Xiao, W., Shi, Z., Zhang, J., Li, X., Chen, L., Li, Y., Zheng, Z., et al. Whale: Efficient giant model training over heterogeneous gpus. In ATC (2022)."},{"key":"e_1_3_2_1_32_1","volume-title":"OSDI","author":"Jiang Y.","year":"2020","unstructured":"Jiang, Y., Zhu, Y., Lan, C., Yi, B., Cui, Y., and Guo, C. A unified architecture for accelerating distributed dnn training in heterogeneous gpu\/cpu clusters. In OSDI (2020)."},{"key":"e_1_3_2_1_33_1","volume-title":"Adam: A method for stochastic optimization. arXiv:1412.6980","author":"Kingma D. P.","year":"2014","unstructured":"Kingma, D. P., and Ba, J. Adam: A method for stochastic optimization. arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_34_1","volume-title":"One weird trick for parallelizing convolutional neural networks. arXiv:1404.5997","author":"Krizhevsky A.","year":"2014","unstructured":"Krizhevsky, A. One weird trick for parallelizing convolutional neural networks. arXiv:1404.5997 (2014)."},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1145\/3342195.3387547"},{"key":"e_1_3_2_1_36_1","volume-title":"Pytorch distributed: Experiences on accelerating data parallel training. arXiv:2006.15704","author":"Li S.","year":"2020","unstructured":"Li, S., Zhao, Y., Varma, R., Salpekar, O., Noordhuis, P., Li, T., Paszke, A., Smith, J., Vaughan, B., Damania, P., et al. Pytorch distributed: Experiences on accelerating data parallel training. arXiv:2006.15704 (2020)."},{"key":"e_1_3_2_1_37_1","volume-title":"Harmony: Overcoming the hurdles of gpu memory capacity to train massive dnn models on commodity servers. arXiv:2202.01306","author":"Li Y.","year":"2022","unstructured":"Li, Y., Phanishayee, A., Murray, D., Tarnawski, J., and Kim, N. S. Harmony: Overcoming the hurdles of gpu memory capacity to train massive dnn models on commodity servers. arXiv:2202.01306 (2022)."},{"key":"e_1_3_2_1_38_1","volume-title":"ATC","author":"Lim G.","year":"2021","unstructured":"Lim, G., Ahn, J., Xiao, W., Kwon, Y., and Jeon, M. Zico: Efficient gpu memory sharing for concurrent dnn training. In ATC (2021)."},{"key":"e_1_3_2_1_39_1","volume-title":"Tbem: Testing-based gpu-memory consumption estimation for deep learning","author":"Liu H.","year":"2022","unstructured":"Liu, H., Liu, S., Wen, C., and Wong, W. E. Tbem: Testing-based gpu-memory consumption estimation for deep learning. IEEE Access (2022)."},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1145\/3267809.3267830"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS51385.2021.00033"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"publisher","DOI":"10.1145\/3472883.3487003"},{"key":"e_1_3_2_1_43_1","volume-title":"NSDI","author":"Mahajan K.","year":"2020","unstructured":"Mahajan, K., Balasubramanian, A., Singhvi, A., Venkataraman, S., Akella, A., Phanishayee, A., and Chawla, S. Themis: Fair and efficient gpu cluster scheduling. In NSDI (2020)."},{"key":"e_1_3_2_1_44_1","volume-title":"Unified Memory for CUDA Beginners. https:\/\/developer.nvidia.com\/blog\/unified-memory-cuda-beginners\/","author":"Mark Harris","year":"2017","unstructured":"Mark Harris. Unified Memory for CUDA Beginners. https:\/\/developer.nvidia.com\/blog\/unified-memory-cuda-beginners\/, 2017."},{"key":"e_1_3_2_1_45_1","volume-title":"OSDI","author":"Mohan J.","year":"2022","unstructured":"Mohan, J., Phanishayee, A., Kulkarni, J., and Chidambaram, V. Looking beyond gpus for dnn scheduling on multi-tenant clusters. In OSDI (2022)."},{"key":"e_1_3_2_1_46_1","volume-title":"ISCA","author":"Mudigere D.","year":"2022","unstructured":"Mudigere, D., Hao, Y., Huang, J., Jia, Z., Tulloch, A., Sridharan, S., Liu, X., Ozdal, M., Nie, J., Park, J., et al. Software-hardware co-design for fast and scalable training of deep learning recommendation models. In ISCA (2022)."},{"key":"e_1_3_2_1_47_1","volume-title":"Chip Prices Increase By 40% & Stock Shortages Expected Till December. https:\/\/wccftech.com\/nvidia-ai-gpu-demand-blows-up-chip-prices-increase-40-percent-stock-shortages-till-december\/","author":"Mujtaba H.","year":"2023","unstructured":"Mujtaba, H. NVIDIA AI GPU Demand Blows Up, Chip Prices Increase By 40% & Stock Shortages Expected Till December. https:\/\/wccftech.com\/nvidia-ai-gpu-demand-blows-up-chip-prices-increase-40-percent-stock-shortages-till-december\/, 2023."},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_49_1","volume-title":"OSDI","author":"Narayanan D.","year":"2020","unstructured":"Narayanan, D., Santhanam, K., Kazhamiaka, F., Phanishayee, A., and Zaharia, M. Heterogeneity-aware cluster scheduling policies for deep learning workloads. In OSDI (2020)."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_51_1","volume-title":"Deep learning training in facebook data centers: Design of scale-up and scale-out systems. arXiv:2003.09518","author":"Naumov M.","year":"2020","unstructured":"Naumov, M., Kim, J., Mudigere, D., Sridharan, S., Wang, X., Zhao, W., Yilmaz, S., Kim, C., Yuen, H., Ozdal, M., et al. Deep learning training in facebook data centers: Design of scale-up and scale-out systems. arXiv:2003.09518 (2020)."},{"key":"e_1_3_2_1_52_1","volume-title":"Deep learning recommendation model for personalization and recommendation systems. arXiv:1906.00091","author":"Naumov M.","year":"2019","unstructured":"Naumov, M., Mudigere, D., Shi, H.-J. M., Huang, J., Sundaraman, N., Park, J., Wang, X., Gupta, U., Wu, C.-J., Azzolini, A. G., et al. Deep learning recommendation model for personalization and recommendation systems. arXiv:1906.00091 (2019)."},{"key":"e_1_3_2_1_53_1","volume-title":"NVIDIA V100 Tensor Core GPU Datasheet. https:\/\/images.nvidia.com\/content\/technologies\/volta\/pdf\/volta-v100-datasheet-update-us-1165301-r5.pdf\/","author":"NVIDIA.","year":"2018","unstructured":"NVIDIA. NVIDIA V100 Tensor Core GPU Datasheet. https:\/\/images.nvidia.com\/content\/technologies\/volta\/pdf\/volta-v100-datasheet-update-us-1165301-r5.pdf\/, 2018."},{"key":"e_1_3_2_1_54_1","volume-title":"NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl","author":"NVIDIA.","year":"2019","unstructured":"NVIDIA. NVIDIA Collective Communication Library (NCCL). https:\/\/developer.nvidia.com\/nccl, 2019."},{"key":"e_1_3_2_1_55_1","volume-title":"NVIDIA A100 Tensor Core GPU Datasheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a100\/pdf\/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf\/","author":"NVIDIA.","year":"2020","unstructured":"NVIDIA. NVIDIA A100 Tensor Core GPU Datasheet. https:\/\/www.nvidia.com\/content\/dam\/en-zz\/Solutions\/Data-Center\/a100\/pdf\/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf\/, 2020."},{"key":"e_1_3_2_1_56_1","volume-title":"CUDA Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf","author":"NVIDIA.","year":"2022","unstructured":"NVIDIA. CUDA Multi-Process Service. https:\/\/docs.nvidia.com\/deploy\/pdf\/CUDA_Multi_Process_Service_Overview.pdf, 2022."},{"key":"e_1_3_2_1_57_1","volume-title":"NVIDIA GH200 Grace Hopper Superchip. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/grace-hopper-superchip","author":"NVIDIA.","year":"2023","unstructured":"NVIDIA. NVIDIA GH200 Grace Hopper Superchip. https:\/\/resources.nvidia.com\/en-us-grace-cpu\/grace-hopper-superchip, 2023."},{"key":"e_1_3_2_1_58_1","unstructured":"Open AI. GPT-4 Technical Report. https:\/\/cdn.openai.com\/papers\/gpt-4.pdf 2023."},{"key":"e_1_3_2_1_59_1","volume-title":"Pytorch: An imperative style, high-performance deep learning library. NeurIPS","author":"Paszke A.","year":"2019","unstructured":"Paszke, A., Gross, S., Massa, F., Lerer, A., Bradbury, J., Chanan, G., Killeen, T., Lin, Z., Gimelshein, N., Antiga, L., et al. Pytorch: An imperative style, high-performance deep learning library. NeurIPS (2019)."},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190517"},{"key":"e_1_3_2_1_61_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/notes\/autograd.html","author":"Autograd","year":"2023","unstructured":"Autograd mechanics. https:\/\/pytorch.org\/docs\/stable\/notes\/autograd.html, 2023."},{"key":"e_1_3_2_1_62_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/autograd.html","author":"Automatic","year":"2023","unstructured":"Automatic differentiation package. https:\/\/pytorch.org\/docs\/stable\/autograd.html, 2023."},{"key":"e_1_3_2_1_63_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/notes\/cuda.html","year":"2023","unstructured":"CUDA semantics. https:\/\/pytorch.org\/docs\/stable\/notes\/cuda.html, 2023."},{"key":"e_1_3_2_1_64_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/generated\/torch.nn.parallel.DistributedDataParallel.html","author":"DistributedDataParallel","year":"2023","unstructured":"DistributedDataParallel. https:\/\/pytorch.org\/docs\/stable\/generated\/torch.nn.parallel.DistributedDataParallel.html, 2023."},{"key":"e_1_3_2_1_65_1","volume-title":"https:\/\/github.com\/pytorch\/pytorch\/blob\/master\/torch\/fx\/passes\/shape_prop.py","author":"Shape","year":"2023","unstructured":"Shape propagation. https:\/\/github.com\/pytorch\/pytorch\/blob\/master\/torch\/fx\/passes\/shape_prop.py, 2023."},{"key":"e_1_3_2_1_66_1","volume-title":"https:\/\/github.com\/pytorch\/pytorch\/blob\/master\/c10\/core\/StorageImpl.h","author":"StorageImpl","year":"2023","unstructured":"StorageImpl. https:\/\/github.com\/pytorch\/pytorch\/blob\/master\/c10\/core\/StorageImpl.h, 2023."},{"key":"e_1_3_2_1_67_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/generated\/torch.cuda.empty_cache.html","year":"2023","unstructured":"torch.cuda.empty_cache. https:\/\/pytorch.org\/docs\/stable\/generated\/torch.cuda.empty_cache.html, 2023."},{"key":"e_1_3_2_1_68_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/generated\/torch.cuda.list_gpu_processes.html","year":"2023","unstructured":"torch.cuda.list_gpu_processes. https:\/\/pytorch.org\/docs\/stable\/generated\/torch.cuda.list_gpu_processes.html, 2023."},{"key":"e_1_3_2_1_69_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/distributed.html#torch.distributed.broadcast_object_list","year":"2023","unstructured":"torch.distributed.broadcast_object_list. https:\/\/pytorch.org\/docs\/stable\/distributed.html#torch.distributed.broadcast_object_list, 2023."},{"key":"e_1_3_2_1_70_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/fx.html","year":"2023","unstructured":"torch.fx. https:\/\/pytorch.org\/docs\/stable\/fx.html, 2023."},{"key":"e_1_3_2_1_71_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/fx.html#torch.fx.Node","author":"Node","year":"2023","unstructured":"torch.fx.Node. https:\/\/pytorch.org\/docs\/stable\/fx.html#torch.fx.Node, 2023."},{"key":"e_1_3_2_1_72_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/optim.html#torch.optim.Optimizer","author":"Optimizer","year":"2023","unstructured":"torch.optim.Optimizer. https:\/\/pytorch.org\/docs\/stable\/optim.html#torch.optim.Optimizer, 2023."},{"key":"e_1_3_2_1_73_1","volume-title":"https:\/\/pytorch.org\/docs\/stable\/profiler.html","year":"2023","unstructured":"torch.profiler. https:\/\/pytorch.org\/docs\/stable\/profiler.html, 2023."},{"key":"e_1_3_2_1_74_1","volume-title":"On the momentum term in gradient descent learning algorithms. Neural Networks","author":"Qian N.","year":"1999","unstructured":"Qian, N. On the momentum term in gradient descent learning algorithms. Neural Networks (1999)."},{"key":"e_1_3_2_1_75_1","volume-title":"Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR","author":"Raffel C.","year":"2020","unstructured":"Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., Zhou, Y., Li, W., and Liu, P. J. Exploring the limits of transfer learning with a unified text-to-text transformer. JMLR (2020)."},{"key":"e_1_3_2_1_76_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1145\/2391229.2391236"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2016.7783721"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507777"},{"key":"e_1_3_2_1_80_1","volume-title":"Meshtensorflow: Deep learning for supercomputers. NeurIPS","author":"Shazeer N.","year":"2018","unstructured":"Shazeer, N., Cheng, Y., Parmar, N., Tran, D., Vaswani, A., Koanantakool, P., Hawkins, P., Lee, H., Hong, M., Young, C., et al. Meshtensorflow: Deep learning for supercomputers. NeurIPS (2018)."},{"key":"e_1_3_2_1_81_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv:1909.08053","author":"Shoeybi M.","year":"2019","unstructured":"Shoeybi, M., Patwary, M., Puri, R., LeGresley, P., Casper, J., and Catanzaro, B. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv:1909.08053 (2019)."},{"key":"e_1_3_2_1_82_1","volume-title":"Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556","author":"Simonyan K.","year":"2014","unstructured":"Simonyan, K., and Zisserman, A. Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556 (2014)."},{"key":"e_1_3_2_1_83_1","volume-title":"Deep learning for recommender systems: A netflix case study. AI Magazine","author":"Steck H.","year":"2021","unstructured":"Steck, H., Baltrunas, L., Elahi, E., Liang, D., Raimond, Y., and Basilico, J. Deep learning for recommender systems: A netflix case study. AI Magazine (2021)."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00044"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.308"},{"key":"e_1_3_2_1_86_1","volume-title":"https:\/\/pytorch.org\/torchrec\/torchrec.distributed.html#torchrec.distributed.model_parallel.DistributedModelParallel","author":"DistributedModelParallel","year":"2023","unstructured":"DistributedModelParallel. https:\/\/pytorch.org\/torchrec\/torchrec.distributed.html#torchrec.distributed.model_parallel.DistributedModelParallel, 2023."},{"key":"e_1_3_2_1_87_1","volume-title":"https:\/\/pytorch.org\/torchrec\/torchrec.distributed.planner.html#torchrec.distributed.planner.planners.EmbeddingShardingPlanner","author":"EmbeddingShardingPlanner","year":"2023","unstructured":"EmbeddingShardingPlanner. https:\/\/pytorch.org\/torchrec\/torchrec.distributed.planner.html#torchrec.distributed.planner.planners.EmbeddingShardingPlanner, 2023."},{"key":"e_1_3_2_1_88_1","volume-title":"https:\/\/github.com\/pytorch\/torchrec","author":"TorchRec","year":"2023","unstructured":"TorchRec. https:\/\/github.com\/pytorch\/torchrec, 2023."},{"key":"e_1_3_2_1_89_1","volume-title":"Attention is all you need. NeurIPS","author":"Vaswani A.","year":"2017","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, \u0141., and Polosukhin, I. Attention is all you need. NeurIPS (2017)."},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"e_1_3_2_1_91_1","volume-title":"OSDI","author":"Xiao W.","year":"2018","unstructured":"Xiao, W., Bhardwaj, R., Ramjee, R., Sivathanu, M., Kwatra, N., Han, Z., Patel, P., Peng, X., Zhao, H., Zhang, Q., et al. Gandiva: Introspective cluster scheduling for deep learning. In OSDI (2018)."},{"key":"e_1_3_2_1_92_1","doi-asserted-by":"publisher","DOI":"10.1145\/2517349.2522727"},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1145\/2791577"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"crossref","unstructured":"Yeung G. Borowiec D. Yang R. Friday A. Harper R. and Garraghan P. Horus: Interference-aware and prediction-based scheduling in deep learning systems. TPDS (2021).","DOI":"10.1109\/TPDS.2021.3079202"},{"key":"e_1_3_2_1_95_1","volume-title":"Fine-grained gpu sharing primitives for deep learning applications. NeurIPS","author":"Yu P.","year":"2020","unstructured":"Yu, P., and Chowdhury, M. Fine-grained gpu sharing primitives for deep learning applications. NeurIPS (2020)."},{"key":"e_1_3_2_1_96_1","volume-title":"Dreamshard: Generalizable embedding table placement for recommender systems. arXiv:2210.02023","author":"Zha D.","year":"2022","unstructured":"Zha, D., Feng, L., Tan, Q., Liu, Z., Lai, K.-H., Bhushanam, B., Tian, Y., Kejariwai, A., and Hu, X. Dreamshard: Generalizable embedding table placement for recommender systems. arXiv:2210.02023 (2022)."},{"key":"e_1_3_2_1_97_1","volume-title":"Dhen: A deep and hierarchical ensemble network for large-scale click-through rate prediction. arXiv:2203.11014","author":"Zhang B.","year":"2022","unstructured":"Zhang, B., Luo, L., Liu, X., Li, J., Chen, Z., Zhang, W., Wei, X., Hao, Y., Tsang, M., Wang, W., et al. Dhen: A deep and hierarchical ensemble network for large-scale click-through rate prediction. arXiv:2203.11014 (2022)."},{"key":"e_1_3_2_1_98_1","volume-title":"Distributed hierarchical gpu parameter server for massive scale deep learning ads systems. MLSys","author":"Zhao W.","year":"2020","unstructured":"Zhao, W., Xie, D., Jia, R., Qian, Y., Ding, R., Sun, M., and Li, P. Distributed hierarchical gpu parameter server for massive scale deep learning ads systems. MLSys (2020)."},{"key":"e_1_3_2_1_99_1","volume-title":"OSDI","author":"Zheng L.","year":"2022","unstructured":"Zheng, L., Li, Z., Zhang, H., Zhuang, Y., Chen, Z., Huang, Y., Wang, Y., Xu, Y., Zhuo, D., Xing, E. P., et al. Alpa: Automating inter-and intra-operator parallelism for distributed deep learning. In OSDI (2022)."}],"event":{"name":"SoCC '23: ACM Symposium on Cloud Computing","location":"Santa Cruz CA USA","acronym":"SoCC '23","sponsor":["SIGMOD ACM Special Interest Group on Management of Data","SIGOPS ACM Special Interest Group on Operating Systems"]},"container-title":["Proceedings of the 2023 ACM Symposium on Cloud Computing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620678.3624661","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620678.3624661","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:56:16Z","timestamp":1755878176000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620678.3624661"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,10,30]]},"references-count":99,"alternative-id":["10.1145\/3620678.3624661","10.1145\/3620678"],"URL":"https:\/\/doi.org\/10.1145\/3620678.3624661","relation":{},"subject":[],"published":{"date-parts":[[2023,10,30]]},"assertion":[{"value":"2023-10-31","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}