{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,18]],"date-time":"2026-03-18T13:23:51Z","timestamp":1773840231981,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":86,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/"}],"funder":[{"name":"NRF","award":["2022R1F1A1062826"],"award-info":[{"award-number":["2022R1F1A1062826"]}]},{"name":"IITP","award":["2021-0-01343 and IITP-2023-RS-2023-00256081"],"award-info":[{"award-number":["2021-0-01343 and IITP-2023-RS-2023-00256081"]}]},{"DOI":"10.13039\/501100006374","name":"Samsung","doi-asserted-by":"publisher","award":["IO230216-05037-01"],"award-info":[{"award-number":["IO230216-05037-01"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656595","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"338-351","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":9,"title":["CLAY: CXL-based Scalable NDP Architecture Accelerating Embedding Layers"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5682-2535","authenticated-orcid":false,"given":"Sungmin","family":"Yun","sequence":"first","affiliation":[{"name":"Seoul National University, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2466-9273","authenticated-orcid":false,"given":"Hwayong","family":"Nam","sequence":"additional","affiliation":[{"name":"Seoul National University, Republic of 
Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-4243-2111","authenticated-orcid":false,"given":"Kwanhee","family":"Kyung","sequence":"additional","affiliation":[{"name":"Seoul National University, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5623-6985","authenticated-orcid":false,"given":"Jaehyun","family":"Park","sequence":"additional","affiliation":[{"name":"Seoul National University, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3227-2436","authenticated-orcid":false,"given":"Byeongho","family":"Kim","sequence":"additional","affiliation":[{"name":"Samsung Electronics, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1956-4629","authenticated-orcid":false,"given":"Yongsuk","family":"Kwon","sequence":"additional","affiliation":[{"name":"Seoul National University, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2739-2924","authenticated-orcid":false,"given":"Eojin","family":"Lee","sequence":"additional","affiliation":[{"name":"Inha University, Korea, South (Republic of Korea)"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-1733-1394","authenticated-orcid":false,"given":"Jung Ho","family":"Ahn","sequence":"additional","affiliation":[{"name":"Seoul National University, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Understanding Training Efficiency of Deep Learning Recommendation Models at Scale","author":"Acun Bilge","unstructured":"Bilge Acun, Matthew Murphy, Xiaodong Wang, Jade Nie, Carole-Jean Wu, and Kim Hazelwood. 2021. Understanding Training Efficiency of Deep Learning Recommendation Models at Scale. In HPCA. IEEE, 802\u2013814."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","unstructured":"Akhil Arunkumar Evgeny Bolotin Benjamin Cho Ugljesa Milic Eiman Ebrahimi Oreste Villa Aamer Jaleel Carole-Jean Wu and David Nellans. 2017. MCM-GPU: Multi-Chip-Module GPUs for Continued Performance Scalability. 
In ISCA. 320\u2013332. https:\/\/doi.org\/10.1145\/3079856.3080231","DOI":"10.1145\/3079856.3080231"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00080"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1145\/2366231.2337162"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","unstructured":"Jonathan Chang 2017. A 7nm 256Mb SRAM in High-K Metal-Gate FinFET Technology with Write-Assist Circuitry for Low-VMIN Applications. In ISSCC. 206\u2013207. https:\/\/doi.org\/10.1109\/ISSCC.2017.7870333","DOI":"10.1109\/ISSCC.2017.7870333"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001177"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.mejo.2016.04.006"},{"key":"e_1_3_2_1_8_1","unstructured":"Compute Express\u00a0Link Consortium. 2022. Compute Express Link 3.0 White Paper. https:\/\/www.computeexpresslink.org\/_files\/ugd\/0c1418_a8713008916044ae9604405d10a7773b.pdf"},{"key":"e_1_3_2_1_9_1","unstructured":"Compute Express\u00a0Link Consortium. 2022. Compute Express Link (CXL) specification revision 3.0."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/2959100.2959190"},{"key":"e_1_3_2_1_11_1","unstructured":"CriteoLabs. 2014. Kaggle Display Advertising Challenge Dataset. http:\/\/labs.criteo.com\/2014\/02\/download-kaggle-display-advertising-challenge-dataset\/"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/99.660313"},{"key":"e_1_3_2_1_13_1","volume-title":"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. 2018. BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv:1810.04805 (2018)."},{"key":"e_1_3_2_1_14_1","unstructured":"Andrea Galimberti Filippo Testa and Alberto Zeni. [n. d.]. 
RTL Router Design in SystemVerilog. https:\/\/github.com\/agalimberti\/NoCRouter"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","unstructured":"Tong Geng Ang Li Runbin Shi Chunshu Wu Tianqi Wang Yanfei Li Pouya Haghi Antonino Tumeo Shuai Che Steve Reinhardt and Martin\u00a0C. Herbordt. 2020. AWB-GCN: A Graph Convolutional Network Accelerator with Runtime Workload Rebalancing. In MICRO. 922\u2013936. https:\/\/doi.org\/10.1109\/MICRO50266.2020.00079","DOI":"10.1109\/MICRO50266.2020.00079"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Tong Geng Chunshu Wu Yongan Zhang Cheng Tan Chenhao Xie Haoran You Martin Herbordt Yingyan Lin and Ang Li. 2021. I-GCN: A Graph Convolutional Network Accelerator with Runtime Locality Enhancement through Islandization. In MICRO. 1051\u20131063.","DOI":"10.1145\/3466752.3480113"},{"key":"e_1_3_2_1_17_1","volume-title":"Graphite: Optimizing Graph Neural Networks on CPUs Through Cooperative Software-Hardware Techniques. In ISCA. 916\u2013931.","author":"Gong Zhangxiaowen","year":"2022","unstructured":"Zhangxiaowen Gong, Houxiang Ji, Yao Yao, Christopher\u00a0W Fletcher, Christopher\u00a0J Hughes, and Josep Torrellas. 2022. Graphite: Optimizing Graph Neural Networks on CPUs Through Cooperative Software-Hardware Techniques. In ISCA. 916\u2013931."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","unstructured":"Udit Gupta Samuel Hsia Vikram Saraph Xiaodong Wang Brandon Reagen Gu-Yeon Wei Hsien-Hsin\u00a0S. Lee David Brooks and Carole-Jean Wu. 2020. DeepRecSys: A System for Optimizing End-to-End at-Scale Neural Recommendation Inference. In ISCA. 982\u2013995. https:\/\/doi.org\/10.1109\/ISCA45697.2020.00084","DOI":"10.1109\/ISCA45697.2020.00084"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Udit Gupta Samuel Hsia Jeff Zhang Mark Wilkening Javin Pombra Hsien-Hsin\u00a0Sean Lee Gu-Yeon Wei Carole-Jean Wu and David Brooks. 2021. 
RecPipe: Co-Designing Models and Hardware to Jointly Optimize Recommendation Quality and Performance. In MICRO.","DOI":"10.1145\/3466752.3480127"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","unstructured":"Udit Gupta Carole-Jean Wu Xiaodong Wang Maxim Naumov Brandon Reagen David Brooks Bradford Cottel Kim Hazelwood Mark Hempstead Bill Jia Hsien-Hsin Lee Andrey Malevich Dheevatsa Mudigere Mikhail Smelyanskiy Liang Xiong and Xuan Zhang. 2020. The Architectural Implications of Facebook\u2019s DNN-Based Personalized Recommendation. In HPCA. 488\u2013501. https:\/\/doi.org\/10.1109\/HPCA47549.2020.00047","DOI":"10.1109\/HPCA47549.2020.00047"},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems, Vol.\u00a030. Curran Associates","author":"Hamilton Will","year":"2017","unstructured":"Will Hamilton, Zhitao Ying, and Jure Leskovec. 2017. Inductive Representation Learning on Large Graphs. In Advances in Neural Information Processing Systems, Vol.\u00a030. Curran Associates, Inc.https:\/\/proceedings.neurips.cc\/paper\/2017\/file\/5dd9db5e033da9c6fb5ba83c7a7ebea9-Paper.pdf"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_23_1","volume-title":"Open Graph Benchmark: Datasets for Machine Learning on Graphs. arXiv:2005.00687","author":"Hu Weihua","year":"2020","unstructured":"Weihua Hu, Matthias Fey, Marinka Zitnik, Yuxiao Dong, Hongyu Ren, Bowen Liu, Michele Catasta, and Jure Leskovec. 2020. Open Graph Benchmark: Datasets for Machine Learning on Graphs. arXiv:2005.00687 (2020)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527418"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","unstructured":"Yu Huang Long Zheng Pengcheng Yao Qinggang Wang Xiaofei Liao Hai Jin and Jingling Xue. 2022. Accelerating Graph Convolutional Networks Using Crossbar-based Processing-In-Memory Architectures. In HPCA. 1029\u20131042. 
https:\/\/doi.org\/10.1109\/HPCA53966.2022.00079","DOI":"10.1109\/HPCA53966.2022.00079"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00057"},{"key":"e_1_3_2_1_27_1","unstructured":"IEEE. 2018. International Roadmap for Devices and Systems: 2018. Technical Report. https:\/\/irds.ieee.org\/editions\/2018\/"},{"key":"e_1_3_2_1_28_1","unstructured":"Intel. 2021. Intel(R) oneAPI Math Kernel Library. https:\/\/github.com\/oneapi-src\/oneMKL"},{"key":"e_1_3_2_1_29_1","unstructured":"JEDEC. 2015. High Bandwidth Memory DRAM(HBM1 HBM2)."},{"key":"e_1_3_2_1_30_1","unstructured":"JEDEC. 2017. DDR4 SDRAM Standard."},{"key":"e_1_3_2_1_31_1","unstructured":"JEDEC. 2020. DDR5 SDRAM Standard."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/VLSIT.2018.8510682"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","unstructured":"Nan Jiang Daniel\u00a0U. Becker George Michelogiannakis James Balfour Brian Towles D.\u00a0E. Shaw John Kim and William\u00a0J. Dally. 2013. A Detailed and Flexible Cycle-Accurate Network-on-Chip Simulator. In ISPASS. 86\u201396. https:\/\/doi.org\/10.1109\/ISPASS.2013.6557149","DOI":"10.1109\/ISPASS.2013.6557149"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","unstructured":"Norman\u00a0P. Jouppi 2017. In-Datacenter Performance Analysis of a Tensor Processing Unit. In ISCA. 1\u201312. https:\/\/doi.org\/10.1145\/3140659.3080246","DOI":"10.1145\/3140659.3080246"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","unstructured":"Norman\u00a0P. Jouppi Doe\u00a0Hyun Yoon Matthew Ashcraft Mark Gottscho Thomas\u00a0B. Jablin George Kurian James Laudon Sheng Li Peter\u00a0C. Ma Xiaoyu Ma Thomas Norrie Nishant Patil Sushma Prasad Cliff Young Zongwei Zhou and David\u00a0A. Patterson. 2021. Ten Lessons From Three Generations Shaped Google\u2019s TPUv4i: Industrial Product. In ISCA. 1\u201314. 
https:\/\/doi.org\/10.1109\/ISCA52012.2021.00010","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"e_1_3_2_1_36_1","volume-title":"SPACE: Locality-Aware Processing in Heterogeneous Memory for Personalized Recommendations. In ISCA.","author":"Kal Hongju","year":"2021","unstructured":"Hongju Kal, Seokmin Lee, Gun Ko, and Won\u00a0Woo Ro. 2021. SPACE: Locality-Aware Processing in Heterogeneous Memory for Personalized Recommendations. In ISCA."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Liu Ke Udit Gupta Benjamin\u00a0Youngjae Cho David Brooks Vikas Chandra Utku Diril Amin Firoozshahian Kim Hazelwood Bill Jia Hsien-Hsin\u00a0S. Lee Meng Li Bert Maher Dheevatsa Mudigere Maxim Naumov Martin Schatz Mikhail Smelyanskiy Xiaodong Wang Brandon Reagen Carole-Jean Wu Mark Hempstead and Xuan Zhang. 2020. RecNMP: Accelerating Personalized Recommendation with Near-Memory Processing. In ISCA. 790\u2013803.","DOI":"10.1109\/ISCA45697.2020.00070"},{"key":"e_1_3_2_1_38_1","volume-title":"Hercules: Heterogeneity-Aware Inference Serving for At-Scale Personalized Recommendation. In HPCA.","author":"Ke Liu","year":"2022","unstructured":"Liu Ke, Udit Gupta, Mark Hempstead, Carole-Jean Wu, Hsien-Hsin\u00a0S. Lee, and Xuan Zhang. 2022. Hercules: Heterogeneity-Aware Inference Serving for At-Scale Personalized Recommendation. In HPCA."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3097700"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2015.2414456"},{"key":"e_1_3_2_1_41_1","unstructured":"Kioxia. 2023. Introducing the EDSFF E3 Family of Form Factors. https:\/\/americas.kioxia.com\/content\/dam\/kioxia\/en-us\/business\/ssd\/data-center-ssd\/asset\/KIOXIA_EDSFF_E3_Intro_White_Paper.pdf"},{"key":"e_1_3_2_1_42_1","volume-title":"Kipf and Max Welling","author":"N.","year":"2016","unstructured":"Thomas\u00a0N. Kipf and Max Welling. 2016. Semi-supervised Classification with Graph Convolutional Networks. 
arXiv:1609.02907 (2016)."},{"key":"e_1_3_2_1_43_1","unstructured":"Alex Krizhevsky Ilya Sutskever and Geoffrey\u00a0E Hinton. 2012. ImageNet Classification with Deep Convolutional Neural Networks. In Advances in Neural Information Processing Systems Vol.\u00a025."},{"key":"e_1_3_2_1_44_1","unstructured":"H.T. Kung and C.E. Leiserson. 1980. Algorithms for VLSI processor arrays. Introduction to VLSI systems (1980) 271\u2013292."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Daehyun Kwon Heon\u00a0Su Jeong Jaemin Choi Wijong Kim Jae\u00a0Woong Kim Junsub Yoon Jungmin Choi Sanguk Lee Hyunsub\u00a0Norbert Rie Jin-il Lee 2023. 28.7 A 1.1 V 6.4 Gb\/s\/pin 24-Gb DDR5 SDRAM with a Highly-Accurate Duty Corrector and NBTI-Tolerant DLL. In ISSCC. IEEE 27\u201329.","DOI":"10.1109\/ISSCC42615.2023.10067651"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","unstructured":"Youngeun Kwon Yunjae Lee and Minsoo Rhu. 2019. TensorDIMM: A Practical Near-Memory Processing Architecture for Embeddings and Tensor Operations in Deep Learning. In MICRO. 740\u2013753. https:\/\/doi.org\/10.1145\/3352460.3358284","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00029"},{"key":"e_1_3_2_1_48_1","doi-asserted-by":"crossref","unstructured":"Youngeun Kwon and Minsoo Rhu. 2022. Training Personalized Recommendation Systems from (GPU) Scratch: Look Forward not Backwards. In ISCA.","DOI":"10.1145\/3470496.3527386"},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Yunjae Lee Jinha Chung and Minsoo Rhu. 2022. SmartSAGE: Training Large-scale Graph Neural Networks using In-Storage Processing Architectures. In ISCA. 932\u2013945.","DOI":"10.1145\/3470496.3527391"},{"key":"e_1_3_2_1_50_1","unstructured":"Jure Leskovec and Andrej Krevl. 2014. SNAP Datasets: Stanford Large Network Dataset Collection. 
http:\/\/snap.stanford.edu\/data."},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3578835"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00070"},{"key":"e_1_3_2_1_53_1","volume-title":"RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. RoBERTa: A Robustly Optimized BERT Pretraining Approach. arXiv:1907.11692 (2019)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2023.3305668"},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISBI.2008.4541126"},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/1064978.1065034"},{"key":"e_1_3_2_1_57_1","unstructured":"Micron. 2017. Calculating Memory Power for DDR4 SDRAM. https:\/\/www.micron.com\/support\/tools-and-utilities\/power-calc"},{"key":"e_1_3_2_1_58_1","unstructured":"Micron. 2023. 16Gb DDR5 SDRAM Addendum. https:\/\/media-www.micron.com\/-\/media\/client\/global\/documents\/products\/data-sheet\/dram\/ddr5\/16gb_ddr5_sdram_diereva.pdf"},{"key":"e_1_3_2_1_59_1","unstructured":"Dheevatsa Mudigere 2022. Software-Hardware Co-design for Fast and Scalable Training of Deep Learning Recommendation Models. In ISCA."},{"key":"e_1_3_2_1_60_1","volume-title":"Deep Learning Recommendation Model for Personalization and Recommendation Systems. arXiv:1906.00091","author":"Maxim Naumov","year":"2019","unstructured":"Maxim Naumov 2019. Deep Learning Recommendation Model for Personalization and Recommendation Systems. arXiv:1906.00091 (2019)."},{"key":"e_1_3_2_1_61_1","unstructured":"NVIDIA. 2017. NVIDIA V100. https:\/\/www.nvidia.com\/en-us\/data-center\/v100\/"},{"key":"e_1_3_2_1_62_1","unstructured":"NVIDIA. 2020. NVIDIA A100. 
https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/"},{"key":"e_1_3_2_1_63_1","unstructured":"NVIDIA. 2022. NVIDIA H100. https:\/\/www.nvidia.com\/en-us\/data-center\/h100\/"},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","unstructured":"Jaehyun Park Byeongho Kim Sungmin Yun Eojin Lee Minsoo Rhu and Jung Ho Ahn. 2021. TRiM: Enhancing Processor-Memory Interfaces with Scalable Tensor Reduction in Memory. In MICRO. 268\u2013281. https:\/\/doi.org\/10.1145\/3466752.3480080","DOI":"10.1145\/3466752.3480080"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA57654.2024.00078"},{"key":"e_1_3_2_1_66_1","volume-title":"FinCACTI: Architectural Analysis and Modeling of Caches with Deeply-Scaled FinFET Devices. In 2014 IEEE Computer Society Annual Symposium on VLSI. IEEE, 290\u2013295","author":"Shafaei Alireza","year":"2014","unstructured":"Alireza Shafaei, Yanzhi Wang, Xue Lin, and Massoud Pedram. 2014. FinCACTI: Architectural Analysis and Modeling of Caches with Deeply-Scaled FinFET Devices. In 2014 IEEE Computer Society Annual Symposium on VLSI. IEEE, 290\u2013295."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2023.3235972"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","unstructured":"Taejoong Song 2018. A 7nm FinFET SRAM Using EUV Lithography with Dual Write-Driver-Assist Circuitry for Low-Voltage Applications. In ISSCC. 198\u2013200. https:\/\/doi.org\/10.1109\/ISSCC.2018.8310252","DOI":"10.1109\/ISSCC.2018.8310252"},{"key":"e_1_3_2_1_69_1","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2014.2376131"},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"crossref","unstructured":"Xuan Sun Hu Wan Qiao Li Chia-Lin Yang Tei-Wei Kuo and Chun\u00a0Jason Xue. 2022. RM-SSD: In-Storage Computing for Large-Scale Recommendation Inference. 
In HPCA.","DOI":"10.1109\/HPCA53966.2022.00081"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Yan Sun Yifan Yuan Zeduo Yu Reese Kuper Chihun Song Jinghan Huang Houxiang Ji Siddharth Agarwal Jiaqi Lou Ipoom Jeong Ren Wang Jung\u00a0Ho Ahn Tianyin Xu and Nam\u00a0Sung Kim. 2023. Demystifying CXL Memory with Genuine CXL-Ready Systems and Devices. In MICRO.","DOI":"10.1145\/3613424.3614256"},{"key":"e_1_3_2_1_72_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.sysarc.2022.102602"},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1109\/CICC.2018.8357077"},{"key":"e_1_3_2_1_74_1","unstructured":"Ashish Vaswani Noam Shazeer Niki Parmar Jakob Uszkoreit Llion Jones Aidan\u00a0N Gomez Lukasz Kaiser and Illia Polosukhin. 2017. Attention is All You Need. In Advances in Neural Information Processing Systems Vol.\u00a030."},{"key":"e_1_3_2_1_75_1","first-page":"673","article-title":"BNS-GCN: Efficient Full-Graph Training of Graph Convolutional Networks with Partition-Parallelism and Random Boundary Node Sampling","volume":"4","author":"Wan Cheng","year":"2022","unstructured":"Cheng Wan, Youjie Li, Ang Li, Nam\u00a0Sung Kim, and Yingyan Lin. 2022. BNS-GCN: Efficient Full-Graph Training of Graph Convolutional Networks with Partition-Parallelism and Random Boundary Node Sampling. Proceedings of Machine Learning and Systems 4 (2022), 673\u2013693.","journal-title":"Proceedings of Machine Learning and Systems"},{"key":"e_1_3_2_1_76_1","volume-title":"GNNAdvisor: An Adaptive and Efficient Runtime System for GNN Acceleration on GPUs. In 15th USENIX Symposium on Operating Systems Design and Implementation. 515\u2013531","author":"Wang Yuke","year":"2021","unstructured":"Yuke Wang, Boyuan Feng, Gushu Li, Shuangchen Li, Lei Deng, Yuan Xie, and Yufei Ding. 2021. GNNAdvisor: An Adaptive and Efficient Runtime System for GNN Acceleration on GPUs. In 15th USENIX Symposium on Operating Systems Design and Implementation. 
515\u2013531."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","unstructured":"Mark Wilkening Udit Gupta Samuel Hsia Caroline Trippel Carole-Jean Wu David Brooks and Gu-Yeon Wei. 2021. RecSSD: Near Data Processing for Solid State Drive Based Recommendation Inference. In ASPLOS. 717\u2013729. https:\/\/doi.org\/10.1145\/3445814.3446763","DOI":"10.1145\/3445814.3446763"},{"key":"e_1_3_2_1_78_1","doi-asserted-by":"publisher","DOI":"10.1109\/IEDM.2016.7838333"},{"key":"e_1_3_2_1_79_1","volume-title":"International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=ryGs6iA5Km","author":"Xu Keyulu","year":"2019","unstructured":"Keyulu Xu, Weihua Hu, Jure Leskovec, and Stefanie Jegelka. 2019. How Powerful are Graph Neural Networks?. In International Conference on Learning Representations. https:\/\/openreview.net\/forum?id=ryGs6iA5Km"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","unstructured":"Mingyu Yan Lei Deng Xing Hu Ling Liang Yujing Feng Xiaochun Ye Zhimin Zhang Dongrui Fan and Yuan Xie. 2020. HyGCN: A GCN Accelerator with Hybrid Architecture. In HPCA. 15\u201329. https:\/\/doi.org\/10.1109\/HPCA47549.2020.00012","DOI":"10.1109\/HPCA47549.2020.00012"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","unstructured":"Haoran You Tong Geng Yongan Zhang Ang Li and Yingyan Lin. 2022. GCoD: Graph Convolutional Network Acceleration via Dedicated Algorithm and Accelerator Co-Design. In HPCA. 460\u2013474. 
https:\/\/doi.org\/10.1109\/HPCA53966.2022.00041","DOI":"10.1109\/HPCA53966.2022.00041"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2022.3182387"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2023.3283677"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"crossref","unstructured":"Mark Zhao Niket Agarwal Aarti Basant Bu\u011fra Gedik Satadru Pan Mustafa Ozdal Rakesh Komuravelli Jerry Pan Tianshu Bao Haowei Lu Sundaram Narayanan Jack Langman Kevin Wilfong Harsha Rastogi Carole-Jean Wu Christos Kozyrakis and Parik Pol. 2022. Understanding Data Storage and Ingestion for Large-Scale Deep Recommendation Model Training: Industrial Product. In ISCA.","DOI":"10.1145\/3470496.3533044"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/3219819.3219823"},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"crossref","unstructured":"Zhe Zhou Cong Li Xuechao Wei Xiaoyang Wang and Guangyu Sun. 2022. GNNear: Accelerating Full-Batch Training of Graph Neural Networks with Near-Memory Processing. 
In PACT.","DOI":"10.1145\/3559009.3569670"}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","location":"Kyoto Japan","acronym":"ICS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656595","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656595","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:25:17Z","timestamp":1755876317000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656595"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":86,"alternative-id":["10.1145\/3650200.3656595","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656595","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}