{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,2]],"date-time":"2026-01-02T07:46:57Z","timestamp":1767340017424,"version":"3.37.3"},"reference-count":151,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by-nc-nd\/4.0\/"}],"funder":[{"name":"Institute for Information communication Technology Planning and Evaluation","award":["IITP_2024-RS-2024-00437866","RS-2019-II191906","RS-2021-II210310","RS-2021-II210871"],"award-info":[{"award-number":["IITP_2024-RS-2024-00437866","RS-2019-II191906","RS-2021-II210310","RS-2021-II210871"]}]},{"DOI":"10.13039\/501100003725","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["RS-2023-00277080","RS-2024-00415602"],"award-info":[{"award-number":["RS-2023-00277080","RS-2024-00415602"]}],"id":[{"id":"10.13039\/501100003725","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Access"],"published-print":{"date-parts":[[2024]]},"DOI":"10.1109\/access.2024.3465789","type":"journal-article","created":{"date-parts":[[2024,9,23]],"date-time":"2024-09-23T17:29:20Z","timestamp":1727112560000},"page":"142651-142667","source":"Crossref","is-referenced-by-count":2,"title":["Non-Invasive, Memory Access-Triggered Near-Data Processing for DNN Training Acceleration on GPUs"],"prefix":"10.1109","volume":"12","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4989-2834","authenticated-orcid":false,"given":"Hyungkyu","family":"Ham","sequence":"first","affiliation":[{"name":"Department of Computer Science and Engineering, POSTECH, Pohang, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1554-1884","authenticated-orcid":false,"given":"Hyunuk","family":"Cho","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, POSTECH, Pohang, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6369-245X","authenticated-orcid":false,"given":"Minjae","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, Seoul National University, Seoul, South Korea"}]},{"given":"Jueon","family":"Park","sequence":"additional","affiliation":[{"name":"Rebellions Inc., Seongnam, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5492-5346","authenticated-orcid":false,"given":"Jeongmin","family":"Hong","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, POSTECH, Pohang, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3036-6180","authenticated-orcid":false,"given":"Hyojin","family":"Sung","sequence":"additional","affiliation":[{"name":"Graduate School of Data Science, Seoul National University, Seoul, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7331-9819","authenticated-orcid":false,"given":"Eunhyeok","family":"Park","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, POSTECH, Pohang, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8910-533X","authenticated-orcid":false,"given":"Euicheol","family":"Lim","sequence":"additional","affiliation":[{"name":"SK Hynix, Incheon, South Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5749-5794","authenticated-orcid":false,"given":"Gwangsun","family":"Kim","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Engineering, POSTECH, Pohang, South Korea"}]}],"member":"263","reference":[{"volume-title":"How Microsofts Bet on Azure Unlocked an Ai Revolution","year":"2023","author":"Roach","key":"ref1"},{"volume-title":"Introducing the AI Research Supercluster\u2014Metas Cutting-Edge AI Supercomputer for AI Research","year":"2022","author":"Lee","key":"ref2"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00059"},{"key":"ref4","first-page":"6105","article-title":"EfficientNet: Rethinking model scaling for convolutional neural networks","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tan"},{"key":"ref5","first-page":"1877","article-title":"Language models are few-shot learners","volume-title":"Proc. NIPS","author":"Brown"},{"volume-title":"AI and Compute","year":"2018","author":"Amodei","key":"ref6"},{"volume-title":"AI and Memory Wall","year":"2021","author":"Gholami","key":"ref7"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00023"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/HCS52781.2021.9567075"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2016.2616357"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1145\/2541940.2541967"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.42"},{"key":"ref14","first-page":"1","article-title":"Tensor core DL performance guide","volume-title":"Proc. NVIDIA GPU Technol. Conf.","author":"Andersch"},{"volume-title":"Introducing AMD CDNA architecture","year":"2020","key":"ref15"},{"key":"ref16","first-page":"711","article-title":"Data movement is all you need: A case study on optimizing transformers","volume-title":"Proc. Mach. Learn. Syst.","volume":"3","author":"Ivanov"},{"key":"ref17","first-page":"14","article-title":"Restructuring batch normalization to accelerate CNN training","volume-title":"Proc. Mach. Learn. Syst.","volume":"1","author":"Jung"},{"volume-title":"Nvidia a100 tensor core GPU architecture","year":"2020","key":"ref18"},{"volume-title":"AMD Instinct Mi50 Accelerator Support and Drivers","year":"2018","key":"ref19"},{"volume-title":"AMD Instinct MI100 Accelerator","year":"2020","key":"ref20"},{"volume-title":"AMD Instinct MI250x Accelerator","year":"2021","key":"ref21"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750385"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00070"},{"key":"ref24","first-page":"210","article-title":"Opportunistic computing in GPU architectures","volume-title":"Proc. ACM\/IEEE 46th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Pattnaik"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.27"},{"key":"ref26","first-page":"1","article-title":"Toward standardized near-data processing with unrestricted data placement for GPUs","volume-title":"Proc. Int. Conf. High Perform. Comput., Netw., Storage Anal. (SC)","author":"Kim"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/VLSITechnologyandCir46769.2022.9830518"},{"volume-title":"DDR5\/4 Controller IP","year":"2022","key":"ref28"},{"volume-title":"DDR5\/4 Phy IP for TSMC 7nm","year":"2022","key":"ref29"},{"volume-title":"Samsung Electronics Introduces Industrys First 512GB CXL Memory Module","year":"2022","key":"ref30"},{"volume-title":"Sk Hynix Develops DDR5 Dram CXLTM Memory to Expand the CXL Memory Ecosystem","year":"2022","key":"ref31"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875680"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00013"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2022.3164651"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO50266.2020.00040"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/isscc42614.2022.9731711"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3097700"},{"article-title":"Instruction sets should be free: The case for risc-v","year":"2014","author":"Asanovic","key":"ref38"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480128"},{"volume-title":"Instruction Sets Want to be Free","year":"2020","author":"Asanovic","key":"ref40"},{"key":"ref41","article-title":"CuDNN: Efficient primitives for deep learning","author":"Chetlur","year":"2014","journal-title":"arXiv:1410.0759"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00018"},{"key":"ref43","first-page":"1","article-title":"NVSwitch and DGX-2","volume-title":"Proc. IEEE Hot Chips 30 Symp. (HCS)","author":"Ishii"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00030"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00049"},{"key":"ref46","first-page":"172","article-title":"Blink: Fast and generic collectives for distributed ML","volume-title":"Proc. Mach. Learn. Syst.","volume":"2","author":"Wang"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00085"},{"key":"ref48","first-page":"1","article-title":"Adam: A method for stochastic optimization","volume-title":"Proc. 3rd Int. Conf. Learn. Represent. (ICLR)","author":"Kingma"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/sc41405.2020.00024"},{"key":"ref50","first-page":"1","article-title":"Mixed precision training","volume-title":"Proc. 6th Int. Conf. Learn. Represent. (ICLR)","author":"Micikevicius"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00080"},{"key":"ref52","first-page":"265","article-title":"TensorFlow: A system for large-scale machine learning","volume-title":"Proc. 12th USENIX Symp. Operating Syst. Design Implement.","author":"Abadi"},{"key":"ref53","first-page":"1","article-title":"PyTorch: An imperative style, high-performance deep learning library","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Paszke"},{"volume-title":"XLA: Optimizing Compiler for Machine Learning","year":"2024","key":"ref54"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.5555\/3045118.3045167"},{"key":"ref56","article-title":"Layer normalization","author":"Ba","year":"2016","journal-title":"arXiv:1607.06450"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1706.03762"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1810.04805"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref63","first-page":"1","article-title":"Very deep convolutional networks for large-scale image recognition","volume-title":"Proc. ICLR","author":"Simonyan"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454083"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507723"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/PACT52795.2021.00008"},{"key":"ref67","first-page":"578","article-title":"TVM: An automated end-to-end optimizing compiler for deep learning","volume-title":"Proc. 13th USENIX Symp. Operating Syst. Design Implement. (OSDI)","author":"Chen"},{"key":"ref68","article-title":"Glow: Graph lowering compiler techniques for neural networks","author":"Rotem","year":"2018","journal-title":"arXiv:1805.00907"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3453483.3454038"},{"volume-title":"Batch Normalization Layer","year":"2024","key":"ref70"},{"key":"ref71","article-title":"Full stack optimization of transformer inference: A survey","author":"Kim","year":"2023","journal-title":"arXiv:2302.14017"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.1145\/3360307"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.12"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA53966.2022.00042"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2017.37"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/hoti55740.2022.00017"},{"volume-title":"System and method for hardware-based GPU paging to system memory","year":"2009","author":"Danilak","key":"ref77"},{"key":"ref78","article-title":"Low-overhead general-purpose near-data processing in CXL memory expanders","author":"Ham","year":"2024","journal-title":"arXiv:2404.19381"},{"volume-title":"Cuda C Programming Guide: Thread Hierarchy","year":"2024","key":"ref79"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080239"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2018.00074"},{"volume-title":"Nvidia Data Center Deep Learning Product Performance","year":"2024","key":"ref82"},{"key":"ref83","doi-asserted-by":"publisher","DOI":"10.1109\/ispass.2019.00041"},{"volume-title":"Cutlass: Cuda Template Library for Dense Linear Algebra at all Levels and Scale","year":"2018","author":"Kerr","key":"ref84"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/HCS52781.2021.9567153"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/MCSE.2021.3057203"},{"volume-title":"Cuda Streams: Best Practices and Common Pitfalls","year":"2014","author":"Luitjens","key":"ref87"},{"volume-title":"Cuda Semantics Memory Management","year":"2024","key":"ref88"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527379"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00047"},{"key":"ref91","doi-asserted-by":"publisher","DOI":"10.1145\/3211346.3211354"},{"key":"ref92","first-page":"387","article-title":"A learned performance model for tensor processing units","volume-title":"Proc. Mach. Learn. Syst.","volume":"3","author":"Kaufman"},{"volume-title":"NVIDIA TensorRT Developer Documentation","year":"2024","key":"ref93"},{"key":"ref94","first-page":"937","article-title":"CAVs: An efficient runtime system for dynamic neural networks","volume-title":"Proc. USENIX Conf. Usenix Annu. Tech. Conf.","author":"Xu"},{"key":"ref95","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2013.6557149"},{"key":"ref96","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2015.2414456"},{"key":"ref97","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480063"},{"key":"ref98","volume-title":"CACTI 6.0: A Tool to Model Large Caches","volume":"27","author":"Muralimanohar","year":"2009"},{"key":"ref99","doi-asserted-by":"publisher","DOI":"10.1109\/NOCS.2012.31"},{"volume-title":"GTC China 2020 Keynote","year":"2020","author":"Dally","key":"ref100"},{"volume-title":"NCCL Tests","year":"2024","key":"ref101"},{"key":"ref102","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358307"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"volume-title":"NVIDIA cuDNN 8.5.0 API: cudnnFusedOps_t","year":"2022","key":"ref104"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895480"},{"key":"ref106","first-page":"10096","article-title":"EfficientNetv2: Smaller models and faster training","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Tan"},{"key":"ref107","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00293"},{"volume-title":"ONNX: Open Neural Network Exchange","year":"2024","key":"ref109"},{"volume-title":"TSMC, not Intel, has the Lead in Semiconductor Processes","year":"2018","author":"Hibben","key":"ref110"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.1109\/TVLSI.2020.3044752"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1109\/ISLPED.2019.8824959"},{"key":"ref113","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358284"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00029"},{"key":"ref115","doi-asserted-by":"publisher","DOI":"10.1145\/3445814.3446763"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480080"},{"key":"ref117","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00080"},{"key":"ref118","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA.2016.41"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00059"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00061"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2018.2876312"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2020.3011265"},{"key":"ref123","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3123977"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/ISLPED.2017.8009163"},{"key":"ref125","doi-asserted-by":"publisher","DOI":"10.1109\/DAC.2018.8465866"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037702"},{"key":"ref127","first-page":"802","article-title":"FloatPIM: In-memory acceleration of deep neural network training with high precision","volume-title":"Proc. ACM\/IEEE 46th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Imani"},{"key":"ref128","doi-asserted-by":"publisher","DOI":"10.1147\/JRD.2015.2409732"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1145\/2600212.2600213"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.1145\/2967938.2967940"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.37"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1145\/3392717.3392760"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/MM.2021.3088396"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1109\/FPL50879.2020.00014"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1145\/3173162.3173177"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071005"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1145\/3357526.3357532"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00022"},{"key":"ref139","doi-asserted-by":"publisher","DOI":"10.1145\/1555754.1555789"},{"volume-title":"An Introduction to CCIX","year":"2019","key":"ref140"},{"volume-title":"OpenCAPI Overview","year":"2016","key":"ref141"},{"volume-title":"Gen-Z DRAM and Persistent Memory Theory of Operation","year":"2019","author":"Krause","key":"ref142"},{"key":"ref143","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00021"},{"key":"ref144","article-title":"MGPU-TSM: A multi-GPU system with truly shared memory","author":"Mojumder","year":"2020","journal-title":"arXiv:2008.02300"},{"key":"ref145","first-page":"551","article-title":"ZeRO-Offload: Democratizing billion-scale model training","volume-title":"Proc. USENIX Annu. Tech. Conf.","author":"Ren"},{"key":"ref146","first-page":"181","article-title":"Poseidon: An efficient communication architecture for distributed deep learning on gpu clusters","volume-title":"Proc. USENIX Conf. Usenix Annu. Tech. Conf.","author":"Zhang"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1109\/COMHPC.2016.006"},{"key":"ref148","first-page":"829","article-title":"In-network aggregation for shared machine learning clusters","volume-title":"Proc. Mach. Learn. Syst.","volume":"3","author":"Gebara"},{"key":"ref149","first-page":"279","article-title":"Accelerating distributed reinforcement learning with in-switch computing","volume-title":"Proc. ACM\/IEEE 46th Annu. Int. Symp. Comput. Archit. (ISCA)","author":"Li"},{"key":"ref150","first-page":"785","article-title":"Scaling distributed machine learning with in-network aggregation","volume-title":"Proc. 18th USENIX Symp. Netw. Syst. Design Implement. (NSDI)","volume":"21","author":"Sapio"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1109\/HOTI52880.2021.00015"}],"container-title":["IEEE Access"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/6287639\/10380310\/10685403.pdf?arnumber=10685403","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,10]],"date-time":"2024-10-10T15:22:54Z","timestamp":1728573774000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/10685403\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"references-count":151,"URL":"https:\/\/doi.org\/10.1109\/access.2024.3465789","relation":{},"ISSN":["2169-3536"],"issn-type":[{"type":"electronic","value":"2169-3536"}],"subject":[],"published":{"date-parts":[[2024]]}}}