{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,1]],"date-time":"2026-01-01T10:04:03Z","timestamp":1767261843606,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,8,9]],"date-time":"2021-08-09T00:00:00Z","timestamp":1628467200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61872135, 61932010"],"award-info":[{"award-number":["61872135, 61932010"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Xiangjiang Artificial Intelligence Academy","award":["202021B02"],"award-info":[{"award-number":["202021B02"]}]},{"name":"Zhejiang Lab","award":["2020KC0AC01"],"award-info":[{"award-number":["2020KC0AC01"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,8,9]]},"DOI":"10.1145\/3472456.3472520","type":"proceedings-article","created":{"date-parts":[[2021,10,5]],"date-time":"2021-10-05T18:39:57Z","timestamp":1633459197000},"page":"1-12","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":3,"title":["A Novel Multi-CPU\/GPU Collaborative Computing Framework for SGD-based Matrix Factorization"],"prefix":"10.1145","author":[{"given":"Yizhi","family":"Huang","sequence":"first","affiliation":[{"name":"Hunan University, Zhejiang Lab, China"}]},{"given":"Yanlong","family":"Yin","sequence":"additional","affiliation":[{"name":"Zhejiang Lab, China"}]},{"given":"Yan","family":"Liu","sequence":"additional","affiliation":[{"name":"Hunan University, China"}]},{"given":"Shuibing","family":"He","sequence":"additional","affiliation":[{"name":"Zhejiang University, China"}]},{"given":"Yang","family":"Bai","sequence":"additional","affiliation":[{"name":"Hunan University, China"}]},{"given":"Renfa","family":"Li","sequence":"additional","affiliation":[{"name":"Hunan University, China"}]}],"member":"320","published-online":{"date-parts":[[2021,10,5]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"AMD. 2021. AMD ROCm\u2122 Open Ecosystem. Website. https:\/\/www.amd.com\/en\/graphics\/servers-solutions-rocm.  AMD. 2021. AMD ROCm\u2122 Open Ecosystem. Website. https:\/\/www.amd.com\/en\/graphics\/servers-solutions-rocm."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2668133"},{"key":"e_1_3_2_1_3_1","unstructured":"Intel Coorporation. 2013. An Introduction to the Intel\u00ae QuickPath Interconnect. https:\/\/www.intel.com\/content\/www\/us\/en\/io\/quickpath-technology\/quick-path-interconnect-introduction-paper.html.  Intel Coorporation. 2013. An Introduction to the Intel\u00ae QuickPath Interconnect. https:\/\/www.intel.com\/content\/www\/us\/en\/io\/quickpath-technology\/quick-path-interconnect-introduction-paper.html."},{"key":"e_1_3_2_1_4_1","unstructured":"Intel Coorporation. 2017. Intel\u00ae Performance Counter Monitor - A Better Way to Measure CPU Utilization. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/articles\/intel-performance-counter-monitor.html.  Intel Coorporation. 2017. Intel\u00ae Performance Counter Monitor - A Better Way to Measure CPU Utilization. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/articles\/intel-performance-counter-monitor.html."},{"key":"e_1_3_2_1_5_1","unstructured":"Intel Coorporation. 2019. SECOND GENERATION Intel\u00ae Xeon\u00aeScalable Processors. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/docs\/processors\/xeon\/2nd-gen-xeon-scalable-datasheet-vol-1.html.  Intel Coorporation. 2019. SECOND GENERATION Intel\u00ae Xeon\u00aeScalable Processors. https:\/\/www.intel.com\/content\/www\/us\/en\/products\/docs\/processors\/xeon\/2nd-gen-xeon-scalable-datasheet-vol-1.html."},{"key":"e_1_3_2_1_6_1","unstructured":"NVIDIA Coorporation. 2020. NVIDIA Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems.  NVIDIA Coorporation. 2020. NVIDIA Nsight Systems. https:\/\/developer.nvidia.com\/nsight-systems."},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/2020408.2020426"},{"key":"e_1_3_2_1_8_1","volume-title":"16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19)","author":"Gu Juncheng","year":"2019","unstructured":"Juncheng Gu , Mosharaf Chowdhury , Kang\u00a0 G Shin , Yibo Zhu , Myeongjae Jeon , Junjie Qian , Hongqiang Liu , and Chuanxiong Guo . 2019 . Tiresias: A GPU cluster manager for distributed deep learning . In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19) . USENIX, Boston, MA, USA, 485\u2013500. Juncheng Gu, Mosharaf Chowdhury, Kang\u00a0G Shin, Yibo Zhu, Myeongjae Jeon, Junjie Qian, Hongqiang Liu, and Chuanxiong Guo. 2019. Tiresias: A GPU cluster manager for distributed deep learning. In 16th USENIX Symposium on Networked Systems Design and Implementation (NSDI 19). USENIX, Boston, MA, USA, 485\u2013500."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1145\/2600428.2609593"},{"key":"e_1_3_2_1_10_1","unstructured":"Intel. 2021. Intel\u00ae oneAPI Programming Guide. Website. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/documentation\/oneapi-programming-guide\/top.html.  Intel. 2021. Intel\u00ae oneAPI Programming Guide. Website. https:\/\/software.intel.com\/content\/www\/us\/en\/develop\/documentation\/oneapi-programming-guide\/top.html."},{"key":"e_1_3_2_1_11_1","volume-title":"2019 USENIX Annual Technical Conference (USENIX ATC 19)","author":"Jeon Myeongjae","year":"2019","unstructured":"Myeongjae Jeon , Shivaram Venkataraman , Amar Phanishayee , Junjie Qian , Wencong Xiao , and Fan Yang . 2019 . Analysis of large-scale multi-tenant GPU clusters for DNN training workloads . In 2019 USENIX Annual Technical Conference (USENIX ATC 19) . USENIX, Renton, WA, USA, 947\u2013960. Myeongjae Jeon, Shivaram Venkataraman, Amar Phanishayee, Junjie Qian, Wencong Xiao, and Fan Yang. 2019. Analysis of large-scale multi-tenant GPU clusters for DNN training workloads. In 2019 USENIX Annual Technical Conference (USENIX ATC 19). USENIX, Renton, WA, USA, 947\u2013960."},{"key":"e_1_3_2_1_12_1","volume-title":"14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20)","author":"Jiang Yimin","year":"2020","unstructured":"Yimin Jiang , Yibo Zhu , Chang Lan , Bairen Yi , Yong Cui , and Chuanxiong Guo . 2020 . A Unified Architecture for Accelerating Distributed {DNN} Training in Heterogeneous GPU\/CPU Clusters . In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20) . USENIX, virtually, 463\u2013479. Yimin Jiang, Yibo Zhu, Chang Lan, Bairen Yi, Yong Cui, and Chuanxiong Guo. 2020. A Unified Architecture for Accelerating Distributed {DNN} Training in Heterogeneous GPU\/CPU Clusters. In 14th USENIX Symposium on Operating Systems Design and Implementation (OSDI 20). USENIX, virtually, 463\u2013479."},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"publisher","DOI":"10.1002\/cpe.3722"},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959165"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1145\/3184407.3184424"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1007\/s00521-018-3354-z"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2017.2718515"},{"key":"e_1_3_2_1_18_1","volume-title":"Big Learning NIPS Workshop, Vol.\u00a06. Association for Computing Machinery","author":"Li Mu","year":"2013","unstructured":"Mu Li , Li Zhou , Zichao Yang , Aaron Li , Fei Xia , David\u00a0 G Andersen , and Alexander Smola . 2013 . Parameter server for distributed machine learning . In Big Learning NIPS Workshop, Vol.\u00a06. Association for Computing Machinery , San Diego, California, USA, 2. Mu Li, Li Zhou, Zichao Yang, Aaron Li, Fei Xia, David\u00a0G Andersen, and Alexander Smola. 2013. Parameter server for distributed machine learning. In Big Learning NIPS Workshop, Vol.\u00a06. Association for Computing Machinery, San Diego, California, USA, 2."},{"volume-title":"OpenCL programming guide","author":"Munshi Aaftab","key":"e_1_3_2_1_19_1","unstructured":"Aaftab Munshi , Benedict Gaster , Timothy\u00a0 G Mattson , and Dan Ginsburg . 2011. OpenCL programming guide . Addison-Wesley Professional , UK. Aaftab Munshi, Benedict Gaster, Timothy\u00a0G Mattson, and Dan Ginsburg. 2011. OpenCL programming guide. Addison-Wesley Professional, UK."},{"key":"e_1_3_2_1_20_1","unstructured":"J Myeongjae V Shivaram P Amar 2018. Multi-Tenant GPU Clusters for Deep Learning Workloads: Analysis and Implications.  J Myeongjae V Shivaram P Amar 2018. Multi-Tenant GPU Clusters for Deep Learning Workloads: Analysis and Implications."},{"key":"e_1_3_2_1_21_1","unstructured":"Feng Niu Benjamin Recht Christopher Re and Stephen\u00a0J. Wright. 2011. HOGWILD!: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent. arXiv:arXiv:1106.5730  Feng Niu Benjamin Recht Christopher Re and Stephen\u00a0J. Wright. 2011. HOGWILD!: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent. arXiv:arXiv:1106.5730"},{"key":"e_1_3_2_1_22_1","unstructured":"NVIDIA. 2019. CUDA C++ Programming Guide. Website. https:\/\/docs.nvidia.com\/cuda\/archive\/10.2\/cuda-c-programming-guide\/index.html.  NVIDIA. 2019. CUDA C++ Programming Guide. Website. https:\/\/docs.nvidia.com\/cuda\/archive\/10.2\/cuda-c-programming-guide\/index.html."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDM.2012.120"},{"volume-title":"TOP500 Certificates for top ranking systems in the 55th List. Website. https:\/\/www.top500.org\/news\/top500-certificates-top-ranking-systems-55th-list\/","key":"e_1_3_2_1_24_1","unstructured":"TOP500.org. 2020. TOP500 Certificates for top ranking systems in the 55th List. Website. https:\/\/www.top500.org\/news\/top500-certificates-top-ranking-systems-55th-list\/ . TOP500.org. 2020. TOP500 Certificates for top ranking systems in the 55th List. Website. https:\/\/www.top500.org\/news\/top500-certificates-top-ranking-systems-55th-list\/."},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1145\/3225058.3225135"},{"key":"e_1_3_2_1_26_1","volume-title":"Elastic Deep Learning in Multi-Tenant GPU Clusters","author":"Wu Yidi","year":"2021","unstructured":"Yidi Wu , Kaihao Ma , Xiao Yan , Zhi Liu , Zhenkun Cai , Yuzhen Huang , James Cheng , Han Yuan , and Fan Yu. 2021. Elastic Deep Learning in Multi-Tenant GPU Clusters . IEEE Transactions on Parallel and Distributed Systems ( 2021 ), 1\u20131. Yidi Wu, Kaihao Ma, Xiao Yan, Zhi Liu, Zhenkun Cai, Yuzhen Huang, James Cheng, Han Yuan, and Fan Yu. 2021. Elastic Deep Learning in Multi-Tenant GPU Clusters. IEEE Transactions on Parallel and Distributed Systems (2021), 1\u20131."},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1145\/3078597.3078602"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"crossref","unstructured":"Yuanhang Yu Dong Wen Ying Zhang Xiaoyang Wang Wenjie Zhang and Xuemin Lin. 2020. Efficient Matrix Factorization on Heterogeneous CPU-GPU Systems. arXiv:arXiv:2006.15980  Yuanhang Yu Dong Wen Ying Zhang Xiaoyang Wang Wenjie Zhang and Xuemin Lin. 2020. Efficient Matrix Factorization on Heterogeneous CPU-GPU Systems. arXiv:arXiv:2006.15980","DOI":"10.1109\/ICDE51399.2021.00169"},{"key":"e_1_3_2_1_29_1","volume-title":"NOMAD: Non-locking, stOchastic Multi-machine algorithm for Asynchronous and Decentralized matrix completion. arXiv:arXiv:1312.0193","author":"Yun Hyokun","year":"2013","unstructured":"Hyokun Yun , Hsiang-Fu Yu , Cho-Jui Hsieh , S.\u00a0V.\u00a0 N. Vishwanathan , and Inderjit Dhillon . 2013 . NOMAD: Non-locking, stOchastic Multi-machine algorithm for Asynchronous and Decentralized matrix completion. arXiv:arXiv:1312.0193 Hyokun Yun, Hsiang-Fu Yu, Cho-Jui Hsieh, S.\u00a0V.\u00a0N. Vishwanathan, and Inderjit Dhillon. 2013. NOMAD: Non-locking, stOchastic Multi-machine algorithm for Asynchronous and Decentralized matrix completion. arXiv:arXiv:1312.0193"}],"event":{"name":"ICPP 2021: 50th International Conference on Parallel Processing","acronym":"ICPP 2021","location":"Lemont IL USA"},"container-title":["50th International Conference on Parallel Processing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3472456.3472520","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3472456.3472520","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:17:23Z","timestamp":1750191443000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3472456.3472520"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,8,9]]},"references-count":29,"alternative-id":["10.1145\/3472456.3472520","10.1145\/3472456"],"URL":"https:\/\/doi.org\/10.1145\/3472456.3472520","relation":{},"subject":[],"published":{"date-parts":[[2021,8,9]]},"assertion":[{"value":"2021-10-05","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}