{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,11]],"date-time":"2026-03-11T01:52:23Z","timestamp":1773193943489,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":212,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,13]],"date-time":"2024-10-13T00:00:00Z","timestamp":1728777600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"European Union\u2019s Horizon programme for research and innovation [101047160 - BioPIM]"},{"name":"ACCESS \u2013 AI Chip Center for Emerging Smart Systems, sponsored by InnoHK funding, Hong Kong SAR"},{"name":"Semiconductor Research Corporation (SRC)"},{"name":"ETH Future Computing Laboratory (EFCL)"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,14]]},"DOI":"10.1145\/3656019.3676947","type":"proceedings-article","created":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T10:34:08Z","timestamp":1728642848000},"page":"201-218","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":18,"title":["PIM-Opt: Demystifying Distributed Optimization Algorithms on a Real-World Processing-In-Memory System"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0000-3389-5064","authenticated-orcid":false,"given":"Steve","family":"Rhyner","sequence":"first","affiliation":[{"name":"ETH Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0849-8724","authenticated-orcid":false,"given":"Haocong","family":"Luo","sequence":"additional","affiliation":[{"name":"ETH Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-6514-1571","authenticated-orcid":false,"given":"Juan","family":"G\u00f3mez-Luna","sequence":"additional","affiliation":[{"name":"NVIDIA, 
Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4029-0175","authenticated-orcid":false,"given":"Mohammad","family":"Sadrosadati","sequence":"additional","affiliation":[{"name":"ETH Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0051-0046","authenticated-orcid":false,"given":"Jiawei","family":"Jiang","sequence":"additional","affiliation":[{"name":"Wuhan University, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5333-5726","authenticated-orcid":false,"given":"Ataberk","family":"Olgun","sequence":"additional","affiliation":[{"name":"ETH Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6704-0473","authenticated-orcid":false,"given":"Harshita","family":"Gupta","sequence":"additional","affiliation":[{"name":"ETH Zurich, Switzerland"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8105-7505","authenticated-orcid":false,"given":"Ce","family":"Zhang","sequence":"additional","affiliation":[{"name":"University of Chicago, USA"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0075-2312","authenticated-orcid":false,"given":"Onur","family":"Mutlu","sequence":"additional","affiliation":[{"name":"ETH Zurich, Switzerland"}]}],"member":"320","published-online":{"date-parts":[[2024,10,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"Junwhan Ahn Sungpack Hong Sungjoo Yoo Onur Mutlu and Kiyoung Choi. 2015. A Scalable Processing-In-Memory Accelerator for Parallel Graph Processing. In ISCA."},{"key":"e_1_3_2_1_2_1","unstructured":"Junwhan Ahn Sungjoo Yoo Onur Mutlu and Kiyoung Choi. 2015. PIM-Enabled Instructions: A Low-Overhead Locality-Aware Processing-In-Memory Architecture. In ISCA."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"crossref","unstructured":"Berkin Akin Franz Franchetti and James\u00a0C Hoe. 2015. Data Reorganization in Memory Using 3D-Stacked DRAM. In ISCA.","DOI":"10.1145\/2749469.2750397"},{"key":"e_1_3_2_1_4_1","volume-title":"QSGD: Communication-Efficient SGD via Gradient Quantization and Encoding. 
In NeurIPS.","author":"Alistarh Dan","year":"2017","unstructured":"Dan Alistarh, Demjan Grubic, Jerry Li, Ryota Tomioka, and Milan Vojnovic. 2017. QSGD: Communication-Efficient SGD via Gradient Quantization and Encoding. In NeurIPS."},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"crossref","unstructured":"Giuseppe Amato Fabrizio Falchi Claudio Gennaro and Fausto Rabitti. 2016. YFCC100M-HNfc6: A Large-scale Deep Features Benchmark for Similarity Search. In SISAP.","DOI":"10.1007\/978-3-319-46759-7_15"},{"key":"e_1_3_2_1_6_1","unstructured":"AMD. 2019. AMD EPYC 7742. https:\/\/www.amd.com\/en\/support\/downloads\/drivers.html\/processors\/epyc\/epyc-7002-series\/amd-epyc-7742.html."},{"key":"e_1_3_2_1_7_1","volume-title":"FAFNIR: Accelerating Sparse Gathering by Using Efficient Near-Memory Intelligent Reduction. In HPCA.","author":"Asgari Bahar","year":"2021","unstructured":"Bahar Asgari, Ramyad Hadidi, Jiashen Cao, Sung-Kyu Lim, and Hyesoon Kim. 2021. FAFNIR: Accelerating Sparse Gathering by Using Efficient Near-Memory Intelligent Reduction. In HPCA."},{"key":"e_1_3_2_1_8_1","volume-title":"Chameleon: Versatile and practical near-DRAM acceleration architecture for large memory systems. In MICRO.","author":"Asghari-Moghaddam Hadi","year":"2016","unstructured":"Hadi Asghari-Moghaddam, Young\u00a0Hoon Son, Jung\u00a0Ho Ahn, and Nam\u00a0Sung Kim. 2016. Chameleon: Versatile and practical near-DRAM acceleration architecture for large memory systems. In MICRO."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/TSP.2009.2016247"},{"key":"e_1_3_2_1_10_1","volume-title":"Neurostream: Scalable and Energy Efficient Deep Learning with Smart Memory Cubes. TPDS","author":"Azarkhish Erfan","year":"2017","unstructured":"Erfan Azarkhish, Davide Rossi, Igor Loi, and Luca Benini. 2017. Neurostream: Scalable and Energy Efficient Deep Learning with Smart Memory Cubes. 
TPDS (2017)."},{"key":"e_1_3_2_1_11_1","volume-title":"Accelerating Large Table Scan Using Processing-In-Memory Technology. Datenbank-Spektrum","author":"Baumstark Alexander","year":"2023","unstructured":"Alexander Baumstark, Muhammad\u00a0Attahir Jibril, and Kai-Uwe Sattler. 2023. Accelerating Large Table Scan Using Processing-In-Memory Technology. Datenbank-Spektrum (2023)."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"crossref","unstructured":"Alexander Baumstark Muhammad\u00a0Attahir Jibril and Kai-Uwe Sattler. 2023. Adaptive Query Compilation with Processing-in-Memory. In ICDEW.","DOI":"10.21203\/rs.3.rs-2223228\/v1"},{"key":"e_1_3_2_1_13_1","doi-asserted-by":"crossref","unstructured":"Arthur Bernhardt Andreas Koch and Ilia Petrov. 2023. pimDB: From Main-Memory DBMS to Processing-In-Memory DBMS-Engines on Intelligent Memories. In DaMoN.","DOI":"10.1145\/3592980.3595312"},{"key":"e_1_3_2_1_14_1","volume-title":"SISA: Set-Centric Instruction Set Architecture for Graph Mining on Processing-in-Memory Systems. In MICRO.","author":"Besta Maciej","year":"2021","unstructured":"Maciej Besta, Raghavendra Kanakagiri, Grzegorz Kwasniewski, Rachata Ausavarungnirun, Jakub Ber\u00e1nek, Konstantinos Kanellopoulos, Kacper Janda, Zur Vonarburg-Shmaria, Lukas Gianinazzi, Ioana Stefan, 2021. SISA: Set-Centric Instruction Set Architecture for Graph Mining on Processing-in-Memory Systems. In MICRO."},{"key":"e_1_3_2_1_15_1","volume-title":"Practical Mechanisms for Reducing Processor\u2013Memory Data Movement in Modern Workloads. Ph.\u00a0D. Dissertation","author":"Boroumand Amirali","unstructured":"Amirali Boroumand. 2020. Practical Mechanisms for Reducing Processor\u2013Memory Data Movement in Modern Workloads. Ph.\u00a0D. Dissertation. Carnegie Mellon University."},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"crossref","unstructured":"Amirali Boroumand Saugata Ghose Berkin Akin Ravi Narayanaswami Geraldo\u00a0F Oliveira Xiaoyu Ma Eric Shiu and Onur Mutlu. 2021. 
Google Neural Network Models for Edge Devices: Analyzing and Mitigating Machine Learning Inference Bottlenecks. In PACT.","DOI":"10.1109\/PACT52795.2021.00019"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"crossref","unstructured":"Amirali Boroumand Saugata Ghose Youngsok Kim Rachata Ausavarungnirun Eric Shiu Rahul Thakur Daehyun Kim Aki Kuusela Allan Knies Parthasarathy Ranganathan 2018. Google Workloads for Consumer Devices: Mitigating Data Movement Bottlenecks. In ASPLOS.","DOI":"10.1145\/3173162.3173177"},{"key":"e_1_3_2_1_18_1","volume-title":"Polynesia: Enabling Effective Hybrid Transactional\/Analytical Databases with Specialized Hardware\/Software Co-Design. arXiv:2103.00798","author":"Boroumand Amirali","year":"2021","unstructured":"Amirali Boroumand, Saugata Ghose, Geraldo\u00a0F Oliveira, and Onur Mutlu. 2021. Polynesia: Enabling Effective Hybrid Transactional\/Analytical Databases with Specialized Hardware\/Software Co-Design. arXiv:2103.00798 (2021)."},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"crossref","unstructured":"Amirali Boroumand Saugata Ghose Minesh Patel Hasan Hassan Brandon Lucia Rachata Ausavarungnirun Kevin Hsieh Nastaran Hajinazar Krishna\u00a0T Malladi Hongzhong Zheng 2019. CoNDA: Efficient Cache Coherence Support for Near-Data Accelerators. In ISCA.","DOI":"10.1145\/3307650.3322266"},{"key":"e_1_3_2_1_20_1","volume-title":"LazyPIM: An Efficient Cache Coherence Mechanism for Processing-in-Memory. ICAL","author":"Boroumand Amirali","year":"2016","unstructured":"Amirali Boroumand, Saugata Ghose, Minesh Patel, Hasan Hassan, Brandon Lucia, Kevin Hsieh, Krishna\u00a0T Malladi, Hongzhong Zheng, and Onur Mutlu. 2016. LazyPIM: An Efficient Cache Coherence Mechanism for Processing-in-Memory. ICAL (2016)."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"crossref","unstructured":"Bernhard\u00a0E Boser Isabelle\u00a0M Guyon and Vladimir\u00a0N Vapnik. 1992. A Training Algorithm for Optimal Margin Classifiers. 
In COLT.","DOI":"10.1145\/130385.130401"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"crossref","unstructured":"L\u00e9on Bottou. 2010. Large-Scale Machine Learning with Stochastic Gradient Descent. In COMPSTAT.","DOI":"10.1007\/978-3-7908-2604-3_16"},{"key":"e_1_3_2_1_23_1","volume-title":"Stochastic Gradient Descent Tricks","author":"Bottou L\u00e9on","unstructured":"L\u00e9on Bottou. 2012. Stochastic Gradient Descent Tricks. Springer."},{"key":"e_1_3_2_1_24_1","volume-title":"Gossip Algorithms: Design, Analysis and Applications. In INFOCOM.","author":"Boyd Stephen","year":"2005","unstructured":"Stephen Boyd, Arpita Ghosh, Balaji Prabhakar, and Devavrat Shah. 2005. Gossip Algorithms: Design, Analysis and Applications. In INFOCOM."},{"key":"e_1_3_2_1_25_1","volume-title":"Distributed Optimization and Statistical Learning via the Alternating Direction Method of Multipliers. Foundations and Trends\u00ae in Machine Learning","author":"Boyd Stephen","year":"2011","unstructured":"Stephen Boyd, Neal Parikh, Eric Chu, Borja Peleato, Jonathan Eckstein, 2011. Distributed Optimization and Statistical Learning via the Alternating Direction Method of Multipliers. Foundations and Trends\u00ae in Machine Learning (2011)."},{"key":"e_1_3_2_1_26_1","volume-title":"Convex Optimization","author":"Boyd P","unstructured":"Stephen\u00a0P Boyd and Lieven Vandenberghe. 2004. Convex Optimization. Cambridge University Press."},{"key":"e_1_3_2_1_27_1","unstructured":"Damla\u00a0Senol Cali Gurpreet\u00a0S Kalsi Z\u00fclal Bing\u00f6l Can Firtina Lavanya Subramanian Jeremie\u00a0S Kim Rachata Ausavarungnirun Mohammed Alser Juan Gomez-Luna Amirali Boroumand 2020. GenASM: A High-Performance Low-Power Approximate String Matching Acceleration Framework for Genome Sequence Analysis. In MICRO."},{"key":"e_1_3_2_1_28_1","volume-title":"Gossip Consensus Algorithms via Quantized Communication. 
Automatica","author":"Carli Ruggero","year":"2010","unstructured":"Ruggero Carli, Fabio Fagnani, Paolo Frasca, and Sandro Zampieri. 2010. Gossip Consensus Algorithms via Quantized Communication. Automatica (2010)."},{"key":"e_1_3_2_1_29_1","volume-title":"Understanding and Improving the Latency of DRAM-based Memory Systems. Ph.\u00a0D. Dissertation","author":"Chang K","unstructured":"Kevin\u00a0K Chang. 2017. Understanding and Improving the Latency of DRAM-based Memory Systems. Ph.\u00a0D. Dissertation. Carnegie Mellon University."},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"crossref","unstructured":"Kevin\u00a0K Chang Prashant\u00a0J Nair Donghyuk Lee Saugata Ghose Moinuddin\u00a0K Qureshi and Onur Mutlu. 2016. Low-Cost Inter-Linked Subarrays (LISA): Enabling fast inter-subarray data movement in DRAM. In HPCA.","DOI":"10.1109\/HPCA.2016.7446095"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"crossref","unstructured":"Jinfan Chen Juan G\u00f3mez-Luna Izzat El\u00a0Hajj Yuxin Guo and Onur Mutlu. 2023. SimplePIM: A Software Framework for Productive and Efficient Processing-in-Memory. In PACT.","DOI":"10.1109\/PACT58117.2023.00017"},{"key":"e_1_3_2_1_32_1","unstructured":"Liang-Chi Chen Chien-Chung Ho and Yuan-Hao Chang. 2023. UpPipe: A Novel Pipeline Management on In-Memory Processors for RNA-seq Quantification. In DAC."},{"key":"e_1_3_2_1_33_1","volume-title":"PRIME: A Novel Processing-In-Memory Architecture for Neural Network Computation in ReRAM-based Main Memory. In ISCA.","author":"Chi Ping","year":"2016","unstructured":"Ping Chi, Shuangchen Li, Cong Xu, Tao Zhang, Jishen Zhao, Yongpan Liu, Yu Wang, and Yuan Xie. 2016. PRIME: A Novel Processing-In-Memory Architecture for Neural Network Computation in ReRAM-based Main Memory. In ISCA."},{"key":"e_1_3_2_1_34_1","volume-title":"A Fast Parallel Stochastic Gradient Method for Matrix Factorization in Shared Memory Systems. 
TIST","author":"Chin Wei-Sheng","year":"2015","unstructured":"Wei-Sheng Chin, Yong Zhuang, Yu-Chin Juan, and Chih-Jen Lin. 2015. A Fast Parallel Stochastic Gradient Method for Matrix Factorization in Shared Memory Systems. TIST (2015)."},{"key":"e_1_3_2_1_35_1","unstructured":"Wei-Sheng Chin Yong Zhuang Yu-Chin Juan and Chih-Jen Lin. 2015. A Learning-Rate Schedule for Stochastic Gradient Methods to Matrix Factorization. In PAKDD."},{"key":"e_1_3_2_1_36_1","volume-title":"McDRAM v2: In-Dynamic Random Access Memory Systolic Array Accelerator to Address the Large Model Problem in Deep Neural Networks on the Edge","author":"Cho Seunghwan","year":"2020","unstructured":"Seunghwan Cho, Haerang Choi, Eunhyeok Park, Hyunsung Shin, and Sungjoo Yoo. 2020. McDRAM v2: In-Dynamic Random Access Memory Systolic Array Accelerator to Address the Large Model Problem in Deep Neural Networks on the Edge. IEEE Access (2020)."},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"crossref","unstructured":"Jack Choquette Wishwesh Gandhi Olivier Giroux Nick Stam and Ronny Krashinsky. 2021. NVIDIA A100 Tensor Core GPU: Performance and Innovation. In MICRO.","DOI":"10.1109\/MM.2021.3061394"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"crossref","unstructured":"Eric Chung Jeremy Fowers Kalin Ovtcharov Michael Papamichael Adrian Caulfield Todd Massengill Ming Liu Daniel Lo Shlomi Alkalay Michael Haselman 2018. Serving DNNs in Real Time at Datacenter Scale with Project Brainwave. In MICRO.","DOI":"10.1109\/MM.2018.022071131"},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"crossref","unstructured":"Aline\u00a0S Cordeiro Sairo\u00a0R dos Santos Francis\u00a0B Moreira Paulo\u00a0C Santos Luigi Carro and Marco\u00a0AZ Alves. 2021. Machine Learning Migration for Efficient Near-Data Processing. In PDP.","DOI":"10.1016\/j.micpro.2022.104435"},{"key":"e_1_3_2_1_40_1","volume-title":"Support-Vector Networks. Machine Learning","author":"Cortes Corinna","year":"1995","unstructured":"Corinna Cortes. 1995. 
Support-Vector Networks. Machine Learning (1995)."},{"key":"e_1_3_2_1_41_1","volume-title":"Formal Aspects of Language Modeling. arXiv:2311.04329","author":"Cotterell Ryan","year":"2023","unstructured":"Ryan Cotterell, Anej Svete, Clara Meister, Tianyu Liu, and Li Du. 2023. Formal Aspects of Language Modeling. arXiv:2311.04329 (2023)."},{"key":"e_1_3_2_1_42_1","unstructured":"Ben Cottier. 2023. Trends in the Dollar Training Cost of Machine Learning Systems. https:\/\/epochai.org\/blog\/trends-in-the-dollar-training-cost-of-machine-learning-systems"},{"key":"e_1_3_2_1_43_1","unstructured":"Criteo AI Lab. 2014. Criteo 1TB Click Logs Dataset. https:\/\/ailab.criteo.com\/download-criteo-1tb-click-logs-dataset\/."},{"key":"e_1_3_2_1_44_1","volume-title":"GraphH: A Processing-in-Memory Architecture for Large-Scale Graph Processing. TCAD","author":"Dai Guohao","year":"2018","unstructured":"Guohao Dai, Tianhao Huang, Yuze Chi, Jishen Zhao, Guangyu Sun, Yongpan Liu, Yu Wang, Yuan Xie, and Huazhong Yang. 2018. GraphH: A Processing-in-Memory Architecture for Large-Scale Graph Processing. TCAD (2018)."},{"key":"e_1_3_2_1_45_1","unstructured":"Guohao Dai Zhenhua Zhu Tianyu Fu Chiyue Wei Bangyan Wang Xiangyu Li Yuan Xie Huazhong Yang and Yu Wang. 2022. DIMMining: Pruning-Efficient and Parallel Graph Mining on Near-Memory-Computing. In ISCA."},{"key":"e_1_3_2_1_46_1","volume-title":"Sai Manoj\u00a0Pudukotai Dinakarrao, and Amlan Ganguly","author":"Das Prangon","year":"2022","unstructured":"Prangon Das, Purab\u00a0Ranjan Sutradhar, Mark Indovina, Sai Manoj\u00a0Pudukotai Dinakarrao, and Amlan Ganguly. 2022. Implementation and Evaluation of Deep Neural Networks in Commercially Available Processing in Memory Hardware. In SOCC."},{"key":"e_1_3_2_1_47_1","unstructured":"Christopher De\u00a0Sa Matthew Feldman Christopher R\u00e9 and Kunle Olukotun. 2017. Understanding and Optimizing Asynchronous Low-Precision Stochastic Gradient Descent. 
In ISCA."},{"key":"e_1_3_2_1_48_1","volume-title":"Optimal Distributed Online Prediction Using Mini-Batches. JMLR","author":"Dekel Ofer","year":"2012","unstructured":"Ofer Dekel, Ran Gilad-Bachrach, Ohad Shamir, and Lin Xiao. 2012. Optimal Distributed Online Prediction Using Mini-Batches. JMLR (2012)."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"crossref","unstructured":"Quan Deng Lei Jiang Youtao Zhang Minxuan Zhang and Jun Yang. 2018. DrAcc: a DRAM based Accelerator for Accurate CNN Inference. In DAC.","DOI":"10.1145\/3195970.3196029"},{"key":"e_1_3_2_1_50_1","volume-title":"Casper: Accelerating Stencil Computations Using Near-Cache Processing","author":"Denzler Alain","year":"2023","unstructured":"Alain Denzler, Geraldo\u00a0F Oliveira, Nastaran Hajinazar, Rahul Bera, Gagandeep Singh, Juan G\u00f3mez-Luna, and Onur Mutlu. 2023. Casper: Accelerating Stencil Computations Using Near-Cache Processing. IEEE Access (2023)."},{"key":"e_1_3_2_1_51_1","unstructured":"Tim Dettmers Mike Lewis Younes Belkada and Luke Zettlemoyer. 2022. GPT3.int8(): 8-bit Matrix Multiplication for Transformers at Scale. In NeurIPS."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"crossref","unstructured":"Fabrice Devaux. 2019. The true Processing in Memory accelerator. In HCS.","DOI":"10.1109\/HOTCHIPS.2019.8875680"},{"key":"e_1_3_2_1_53_1","volume-title":"A Framework for High-throughput Sequence Alignment using Real Processing-in-Memory Systems. Bioinformatics","author":"Diab Safaa","year":"2023","unstructured":"Safaa Diab, Amir Nassereldine, Mohammed Alser, Juan G\u00f3mez\u00a0Luna, Onur Mutlu, and Izzat El\u00a0Hajj. 2023. A Framework for High-throughput Sequence Alignment using Real Processing-in-Memory Systems. Bioinformatics (2023)."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"crossref","unstructured":"Mario Drumond Alexandros Daglis Nooshin Mirzadeh Dmitrii Ustiugov Javier Picorel Babak Falsafi Boris Grot and Dionisios Pnevmatikatos. 2017. 
The Mondrian Data Engine.","DOI":"10.1145\/3079856.3080233"},{"key":"e_1_3_2_1_55_1","volume-title":"Snap ML: A Hierarchical Framework for Machine Learning. In NeurIPS.","author":"D\u00fcnner Celestine","year":"2018","unstructured":"Celestine D\u00fcnner, Thomas Parnell, Dimitrios Sarigiannis, Nikolas Ioannou, Andreea Anghel, Gummadi Ravi, Madhusudanan Kandasamy, and Haralampos Pozidis. 2018. Snap ML: A Hierarchical Framework for Machine Learning. In NeurIPS."},{"key":"e_1_3_2_1_56_1","volume-title":"ORIGAMI: A Heterogeneous Split Architecture for In-Memory Acceleration of Learning. arXiv:1812.11473","author":"Falahati Hajar","year":"2018","unstructured":"Hajar Falahati, Pejman Lotfi-Kamran, Mohammad Sadrosadati, and Hamid Sarbazi-Azad. 2018. ORIGAMI: A Heterogeneous Split Architecture for In-Memory Acceleration of Learning. arXiv:1812.11473 (2018)."},{"key":"e_1_3_2_1_57_1","unstructured":"Rong-En Fan. [n. d.]. LIBSVM Data: A Collection of Benchmarks for Support Vector Machine Research. https:\/\/www.csie.ntu.edu.tw\/~cjlin\/libsvmtools\/datasets\/."},{"key":"e_1_3_2_1_58_1","volume-title":"NDA: Near-DRAM Acceleration Architecture Leveraging Commodity DRAM Devices and Standard Memory Modules. In HPCA.","author":"Farmahini-Farahani Amin","year":"2015","unstructured":"Amin Farmahini-Farahani, Jung\u00a0Ho Ahn, Katherine Morrow, and Nam\u00a0Sung Kim. 2015. NDA: Near-DRAM Acceleration Architecture Leveraging Commodity DRAM Devices and Standard Memory Modules. In HPCA."},{"key":"e_1_3_2_1_59_1","volume-title":"MATSA: An MRAM-based Energy-Efficient Accelerator for Time Series Analysis","author":"Fernandez Ivan","year":"2024","unstructured":"Ivan Fernandez, Christina Giannoula, Aditya Manglik, Ricardo Quislant, Nika\u00a0Mansouri Ghiasi, Juan G\u00f3mez-Luna, Eladio Gutierrez, Oscar Plata, and Onur Mutlu. 2024. MATSA: An MRAM-based Energy-Efficient Accelerator for Time Series Analysis. 
IEEE Access (2024)."},{"key":"e_1_3_2_1_60_1","volume-title":"NATSA: A Near-Data Processing Accelerator for Time Series Analysis. In ICCD.","author":"Fernandez Ivan","year":"2020","unstructured":"Ivan Fernandez, Ricardo Quislant, Eladio Guti\u00e9rrez, Oscar Plata, Christina Giannoula, Mohammed Alser, Juan G\u00f3mez-Luna, and Onur Mutlu. 2020. NATSA: A Near-Data Processing Accelerator for Time Series Analysis. In ICCD."},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"crossref","unstructured":"Jo\u00e3o\u00a0Dinis Ferreira Gabriel Falcao Juan G\u00f3mez-Luna Mohammed Alser Lois Orosa Mohammad Sadrosadati Jeremie\u00a0S Kim Geraldo\u00a0F Oliveira Taha Shahroodi Anant Nori 2022. pLUTo: Enabling Massively Parallel Computation in DRAM via Lookup Tables. In MICRO.","DOI":"10.1109\/MICRO56248.2022.00067"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"crossref","unstructured":"Fei Gao Georgios Tziantzioulis and David Wentzlaff. 2019. ComputeDRAM: In-Memory Compute Using Off-the-Shelf DRAMs. In MICRO.","DOI":"10.1145\/3352460.3358260"},{"key":"e_1_3_2_1_63_1","unstructured":"Mingyu Gao Grant Ayers and Christos Kozyrakis. 2015. Practical Near-Data Processing for In-Memory Analytics Frameworks. In PACT."},{"key":"e_1_3_2_1_64_1","volume-title":"HRL: Efficient and flexible reconfigurable logic for near-data processing. In HPCA.","author":"Gao Mingyu","year":"2016","unstructured":"Mingyu Gao and Christos Kozyrakis. 2016. HRL: Efficient and flexible reconfigurable logic for near-data processing. In HPCA."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037702"},{"key":"e_1_3_2_1_66_1","volume-title":"Neural Networks and the Bias\/Variance Dilemma. Neural Computation","author":"Geman Stuart","year":"1992","unstructured":"Stuart Geman, Elie Bienenstock, and Ren\u00e9 Doursat. 1992. Neural Networks and the Bias\/Variance Dilemma. 
Neural Computation (1992)."},{"key":"e_1_3_2_1_67_1","unstructured":"Nika\u00a0Mansouri Ghiasi Mohammad Sadrosadati Harun Mustafa Arvid Gollwitzer Can Firtina Julien Eudine Haiyu Mao Jo\u00ebl Lindegger Meryem\u00a0Banu Cavlak Mohammed Alser Jisung Park and Onur Mutlu. 2024. MegIS: High-Performance Energy-Efficient and Low-Cost Metagenomic Analysis with In-Storage Processing. In ISCA."},{"key":"e_1_3_2_1_68_1","volume-title":"ALP: Alleviating CPU-Memory Data Movement Overheads in Memory-Centric Systems","author":"Ghiasi Nika\u00a0Mansouri","year":"2022","unstructured":"Nika\u00a0Mansouri Ghiasi, Nandita Vijaykumar, Geraldo\u00a0F Oliveira, Lois Orosa, Ivan Fernandez, Mohammad Sadrosadati, Konstantinos Kanellopoulos, Nastaran Hajinazar, Juan\u00a0G\u00f3mez Luna, and Onur Mutlu. 2022. ALP: Alleviating CPU-Memory Data Movement Overheads in Memory-Centric Systems. IEEE Transactions on Emerging Topics in Computing (2022)."},{"key":"e_1_3_2_1_69_1","volume-title":"Processing-In-Memory: A Workload-Driven Perspective. IBM Journal of Research and Development","author":"Ghose Saugata","year":"2019","unstructured":"Saugata Ghose, Amirali Boroumand, Jeremie\u00a0S Kim, Juan G\u00f3mez-Luna, and Onur Mutlu. 2019. Processing-In-Memory: A Workload-Driven Perspective. IBM Journal of Research and Development (2019)."},{"key":"e_1_3_2_1_70_1","doi-asserted-by":"publisher","DOI":"10.1145\/3508041"},{"key":"e_1_3_2_1_71_1","doi-asserted-by":"crossref","unstructured":"Christina Giannoula Nandita Vijaykumar Nikela Papadopoulou Vasileios Karakostas Ivan Fernandez Juan G\u00f3mez-Luna Lois Orosa Nectarios Koziris Georgios Goumas and Onur Mutlu. 2021. SynCron: Efficient Synchronization Support for Near-Data-Processing Architectures. In HPCA.","DOI":"10.1109\/HPCA51647.2021.00031"},{"key":"e_1_3_2_1_72_1","volume-title":"Accelerating Graph Neural Networks on Real Processing-In-Memory Systems. 
arXiv:2402.16731","author":"Giannoula Christina","year":"2024","unstructured":"Christina Giannoula, Peiming Yang, Ivan\u00a0Fernandez Vega, Jiacheng Yang, Yu\u00a0Xin Li, Juan\u00a0Gomez Luna, Mohammad Sadrosadati, Onur Mutlu, and Gennady Pekhimenko. 2024. Accelerating Graph Neural Networks on Real Processing-In-Memory Systems. arXiv:2402.16731 (2024)."},{"key":"e_1_3_2_1_73_1","volume-title":"SwiftRL: Towards Efficient Reinforcement Learning on Real Processing-In-Memory Systems. arXiv:2405.03967","author":"Gogineni Kailash","year":"2024","unstructured":"Kailash Gogineni, Sai\u00a0Santosh Dayapule, Juan G\u00f3mez-Luna, Karthikeya Gogineni, Peng Wei, Tian Lan, Mohammad Sadrosadati, Onur Mutlu, and Guru Venkataramani. 2024. SwiftRL: Towards Efficient Reinforcement Learning on Real Processing-In-Memory Systems. arXiv:2405.03967 (2024)."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"crossref","unstructured":"Juan G\u00f3mez-Luna Izzat El\u00a0Hajj Ivan Fernandez Christina Giannoula Geraldo\u00a0F Oliveira and Onur Mutlu. 2021. Benchmarking Memory-Centric Computing Systems: Analysis of Real Processing-In-Memory Hardware. In IGSC.","DOI":"10.1109\/IGSC54211.2021.9651614"},{"key":"e_1_3_2_1_75_1","volume-title":"Benchmarking a New Paradigm: Experimental Analysis and Characterization of a Real Processing-In-Memory System","author":"G\u00f3mez-Luna Juan","year":"2022","unstructured":"Juan G\u00f3mez-Luna, Izzat El\u00a0Hajj, Ivan Fernandez, Christina Giannoula, Geraldo\u00a0F Oliveira, and Onur Mutlu. 2022. Benchmarking a New Paradigm: Experimental Analysis and Characterization of a Real Processing-In-Memory System. IEEE Access (2022)."},{"key":"e_1_3_2_1_76_1","volume-title":"An Experimental Evaluation of Machine Learning Training on a Real Processing-In-Memory System. 
arXiv:2207.07886","author":"G\u00f3mez-Luna Juan","year":"2022","unstructured":"Juan G\u00f3mez-Luna, Yuxin Guo, Sylvan Brocard, Julien Legriel, Remy Cimadomo, Geraldo\u00a0F Oliveira, Gagandeep Singh, and Onur Mutlu. 2022. An Experimental Evaluation of Machine Learning Training on a Real Processing-In-Memory System. arXiv:2207.07886 (2022)."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"crossref","unstructured":"Juan G\u00f3mez-Luna Yuxin Guo Sylvan Brocard Julien Legriel Remy Cimadomo Geraldo\u00a0F Oliveira Gagandeep Singh and Onur Mutlu. 2023. Evaluating Machine Learning Workloads on Memory-Centric Computing Systems. In ISPASS.","DOI":"10.1109\/ISPASS57527.2023.00013"},{"key":"e_1_3_2_1_78_1","volume-title":"Deep Learning","author":"Goodfellow Ian","unstructured":"Ian Goodfellow, Yoshua Bengio, and Aaron Courville. 2016. Deep Learning. The MIT Press."},{"key":"e_1_3_2_1_79_1","volume-title":"Biscuit: A Framework for Near-Data Processing of Big Data Workloads. In ISCA.","author":"Gu Boncheol","year":"2016","unstructured":"Boncheol Gu, Andre\u00a0S Yoon, Duck-Ho Bae, Insoon Jo, Jinyoung Lee, Jonghyun Yoon, Jeong-Uk Kang, Moonsang Kwon, Chanho Yoon, Sangyeun Cho, 2016. Biscuit: A Framework for Near-Data Processing of Big Data Workloads. In ISCA."},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"crossref","unstructured":"Harshita Gupta Mayank Kabra Juan G\u00f3mez-Luna Konstantinos Kanellopoulos and Onur Mutlu. 2023. Evaluating Homomorphic Operations on a Real-World Processing-In-Memory System. In IISWC.","DOI":"10.1109\/IISWC59245.2023.00030"},{"key":"e_1_3_2_1_81_1","volume-title":"Benchmarking a New Paradigm: An Experimental Analysis of a Real Processing-in-Memory Architecture. arXiv:2105.03814","author":"G\u00f3mez-Luna Juan","year":"2021","unstructured":"Juan G\u00f3mez-Luna, Izzat\u00a0El Hajj, Ivan Fernandez, Christina Giannoula, Geraldo\u00a0F. Oliveira, and Onur Mutlu. 2021. 
Benchmarking a New Paradigm: An Experimental Analysis of a Real Processing-in-Memory Architecture. arXiv:2105.03814 (2021)."},{"key":"e_1_3_2_1_82_1","volume-title":"SIMDRAM: An End-to-End Framework for Bit-Serial SIMD Computing in DRAM. In ASPLOS.","author":"Hajinazar Nastaran","year":"2021","unstructured":"Nastaran Hajinazar, Geraldo\u00a0F Oliveira, Sven Gregorio, Jo\u00e3o\u00a0Dinis Ferreira, Nika\u00a0Mansouri Ghiasi, Minesh Patel, Mohammed Alser, Saugata Ghose, Juan G\u00f3mez-Luna, and Onur Mutlu. 2021. SIMDRAM: An End-to-End Framework for Bit-Serial SIMD Computing in DRAM. In ASPLOS."},{"key":"e_1_3_2_1_83_1","volume":"201","author":"Han Song","unstructured":"Song Han, Huizi Mao, and William\u00a0J Dally. 2015. Deep Compression: Compressing Deep Neural Networks with Pruning, Trained Quantization and Huffman Coding. arXiv:1510.00149 (2015).","journal-title":"J Dally."},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"crossref","unstructured":"Milad Hashemi Khubaib Eiman Ebrahimi Onur Mutlu and Yale\u00a0N Patt. 2016. Accelerating Dependent Cache Misses with an Enhanced Memory Controller. In ISCA.","DOI":"10.1109\/ISCA.2016.46"},{"key":"e_1_3_2_1_85_1","volume-title":"Continuous Runahead: Transparent Hardware Acceleration for Memory Intensive Workloads. In MICRO.","author":"Hashemi Milad","year":"2016","unstructured":"Milad Hashemi, Onur Mutlu, and Yale\u00a0N Patt. 2016. Continuous Runahead: Transparent Hardware Acceleration for Memory Intensive Workloads. In MICRO."},{"key":"e_1_3_2_1_86_1","doi-asserted-by":"crossref","unstructured":"Syed\u00a0Minhaj Hassan Sudhakar Yalamanchili and Saibal Mukhopadhyay. 2015. Near Data Processing: Impact and Optimization of 3D Memory System Architecture on the Uncore. In MEMSYS.","DOI":"10.1145\/2818950.2818952"},{"key":"e_1_3_2_1_87_1","volume-title":"The Elements of Statistical Learning","author":"Hastie Trevor","unstructured":"Trevor Hastie, Robert Tibshirani, and Jerome Friedman. 2009. The Elements of Statistical Learning. 
Springer."},{"key":"e_1_3_2_1_88_1","volume-title":"Enabling Fast and Energy-Efficient FM-index Exact Matching using Processing-Near-Memory. The Journal of Supercomputing","author":"Herruzo M","year":"2021","unstructured":"Jose\u00a0M Herruzo, Ivan Fernandez, Sonia Gonz\u00e1lez-Navarro, and Oscar Plata. 2021. Enabling Fast and Energy-Efficient FM-index Exact Matching using Processing-Near-Memory. The Journal of Supercomputing (2021)."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"crossref","unstructured":"Kevin Hsieh Eiman Ebrahimi Gwangsun Kim Niladrish Chatterjee Mike O\u2019Connor Nandita Vijaykumar Onur Mutlu and Stephen\u00a0W Keckler. 2016. Transparent Offloading and Mapping (TOM): Enabling Programmer-Transparent Near-Data Processing in GPU Systems. In ISCA.","DOI":"10.1109\/ISCA.2016.27"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"crossref","unstructured":"Kevin Hsieh Samira Khan Nandita Vijaykumar Kevin\u00a0K Chang Amirali Boroumand Saugata Ghose and Onur Mutlu. 2016. Accelerating Pointer Chasing in 3D-Stacked Memory: Challenges Mechanisms Evaluation. In ICCD.","DOI":"10.1109\/ICCD.2016.7753257"},{"key":"e_1_3_2_1_91_1","volume-title":"Using AUC and Accuracy in Evaluating Learning Algorithms","author":"Huang Jin","year":"2005","unstructured":"Jin Huang and Charles\u00a0X Ling. 2005. Using AUC and Accuracy in Evaluating Learning Algorithms. IEEE Transactions on Knowledge and Data Engineering (2005)."},{"key":"e_1_3_2_1_92_1","volume-title":"Pathfinding Future PIM Architectures by Demystifying a Commercial PIM Technology. arXiv:2308.00846","author":"Hyun Bongjoon","year":"2023","unstructured":"Bongjoon Hyun, Taehun Kim, Dongjae Lee, and Minsoo Rhu. 2023. Pathfinding Future PIM Architectures by Demystifying a Commercial PIM Technology. arXiv:2308.00846 (2023)."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"crossref","unstructured":"Bongjoon Hyun Taehun Kim Dongjae Lee and Minsoo Rhu. 2024. 
Pathfinding Future PIM Architectures by Demystifying a Commercial PIM Technology. In HPCA.","DOI":"10.1109\/HPCA57654.2024.00029"},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"crossref","unstructured":"Mohsen Imani Saransh Gupta Yeseong Kim and Tajana Rosing. 2019. FloatPIM: In-Memory Acceleration of Deep Neural Network Training with High Precision. In ISCA.","DOI":"10.1145\/3307650.3322237"},{"key":"e_1_3_2_1_95_1","doi-asserted-by":"crossref","unstructured":"Maurus Item Juan G\u00f3mez-Luna Geraldo\u00a0F. Oliveira Mohammad Sadrosadati Yuxin Guo and Onur Mutlu. 2023. TransPimLib: Efficient Transcendental Functions for Processing-in-Memory Systems. In ISPASS.","DOI":"10.1109\/ISPASS57527.2023.00031"},{"key":"e_1_3_2_1_96_1","unstructured":"Andrei Ivanov Nikoli Dryden Tal Ben-Nun Shigang Li and Torsten Hoefler. 2021. Data Movement is All You Need: A Case Study on Optimizing Transformers. In MLSys."},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1145\/3357526.3357552"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"crossref","unstructured":"Jiawei Jiang Shaoduo Gan Yue Liu Fanlin Wang Gustavo Alonso Ana Klimovic Ankit Singla Wentao Wu and Ce Zhang. 2021. Towards Demystifying Serverless Machine Learning Training. In SIGMOD.","DOI":"10.1145\/3448016.3459240"},{"key":"e_1_3_2_1_99_1","volume-title":"Scalability Limitations of Processing-in-Memory using Real System Evaluations. POMACS","author":"Jonatan Gilbert","year":"2024","unstructured":"Gilbert Jonatan, Haeyoon Cho, Hyojun Son, Xiangyu Wu, Neal Livesay, Evelio Mora, Kaustubh Shivdikar, Jos\u00e9\u00a0L Abell\u00e1n, Ajay Joshi, David Kaeli, 2024. Scalability Limitations of Processing-in-Memory using Real System Evaluations. POMACS (2024)."},{"key":"e_1_3_2_1_100_1","doi-asserted-by":"crossref","unstructured":"Hongbo Kang Yiwei Zhao Guy\u00a0E Blelloch Laxman Dhulipala Yan Gu Charles McGuffey and Phillip\u00a0B Gibbons. 2023. PIM-trie: A Skew-resistant Trie for Processing-in-Memory. 
In SPAA.","DOI":"10.1145\/3558481.3591070"},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1109\/T-C.1969.222754"},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"crossref","unstructured":"Liu Ke Udit Gupta Benjamin\u00a0Youngjae Cho David Brooks Vikas Chandra Utku Diril Amin Firoozshahian Kim Hazelwood Bill Jia Hsien-Hsin\u00a0S Lee 2020. RecNMP: Accelerating Personalized Recommendation with Near-Memory Processing. In ISCA.","DOI":"10.1109\/ISCA45697.2020.00070"},{"key":"e_1_3_2_1_103_1","volume-title":"Near-Memory Processing in Action: Accelerating Personalized Recommendation With AXDIMM. MICRO","author":"Ke Liu","year":"2021","unstructured":"Liu Ke, Xuan Zhang, Jinin So, Jong-Geon Lee, Shin-Haeng Kang, Sukhan Lee, Songyi Han, YeonGon Cho, Jin\u00a0Hyun Kim, Yongsuk Kwon, 2021. Near-Memory Processing in Action: Accelerating Personalized Recommendation With AXDIMM. MICRO (2021)."},{"key":"e_1_3_2_1_104_1","volume-title":"Hamid Farzaneh, and Jeronimo Castrillon.","author":"Khan Asif\u00a0Ali","year":"2024","unstructured":"Asif\u00a0Ali Khan, Jo\u00e3o Paulo\u00a0C De\u00a0Lima, Hamid Farzaneh, and Jeronimo Castrillon. 2024. The Landscape of Compute-Near-Memory and Compute-In-Memory: A Research and Commercial Overview. arXiv:2401.14428 (2024)."},{"key":"e_1_3_2_1_105_1","volume-title":"A Compilation Infrastructure for Heterogeneous Compute In-Memory and Compute Near-Memory Paradigms. arXiv:2301.07486","author":"Khan Asif\u00a0Ali","year":"2022","unstructured":"Asif\u00a0Ali Khan, Hamid Farzaneh, Karl\u00a0FA Friebel, Cl\u00e9ment Fournier, Lorenzo Chelini, and Jeronimo Castrillon. 2022. CINM (Cinnamon): A Compilation Infrastructure for Heterogeneous Compute In-Memory and Compute Near-Memory Paradigms. 
arXiv:2301.07486 (2022)."},{"key":"e_1_3_2_1_106_1","volume-title":"MViD: Sparse Matrix-Vector Multiplication in Mobile DRAM for Accelerating Recurrent Neural Networks","author":"Kim Byeongho","year":"2020","unstructured":"Byeongho Kim, Jongwook Chung, Eojin Lee, Wonkyung Jung, Sunjung Lee, Jaewan Choi, Jaehyun Park, Minbok Wi, Sukhan Lee, and Jung\u00a0Ho Ahn. 2020. MViD: Sparse Matrix-Vector Multiplication in Mobile DRAM for Accelerating Recurrent Neural Networks. IEEE Transactions on Computers (2020)."},{"key":"e_1_3_2_1_107_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001178"},{"key":"e_1_3_2_1_108_1","doi-asserted-by":"crossref","unstructured":"Heesu Kim Hanmin Park Taehyun Kim Kwanheum Cho Eojin Lee Soojung Ryu Hyuk-Jae Lee Kiyoung Choi and Jinho Lee. 2021. GradPIM: A Practical Processing-in-DRAM Architecture for Gradient Descent. In HPCA.","DOI":"10.1109\/HPCA51647.2021.00030"},{"key":"e_1_3_2_1_109_1","volume-title":"GRIM-Filter: Fast Seed Location Filtering in DNA Read Mapping Using Processing-in-Memory Technologies. arXiv:1708.04329","author":"Kim S","year":"2017","unstructured":"Jeremie\u00a0S Kim, Damla Senol, Hongyi Xin, Donghyuk Lee, Saugata Ghose, Mohammed Alser, Hasan Hassan, Oguz Ergin, Can Alkan, and Onur Mutlu. 2017. GRIM-Filter: Fast Seed Location Filtering in DNA Read Mapping Using Processing-in-Memory Technologies. arXiv:1708.04329 (2017)."},{"key":"e_1_3_2_1_110_1","volume-title":"Optimal Model Partitioning with Low-Overhead Profiling on the PIM-based Platform for Deep Learning Inference. TODAES","author":"Kim Seok\u00a0Young","year":"2024","unstructured":"Seok\u00a0Young Kim, Jaewook Lee, Yoonah Paik, Chang\u00a0Hyun Kim, Won\u00a0Jun Lee, and Seon\u00a0Wook Kim. 2024. Optimal Model Partitioning with Low-Overhead Profiling on the PIM-based Platform for Deep Learning Inference. TODAES (2024)."},{"key":"e_1_3_2_1_111_1","volume-title":"Adam: A Method for Stochastic Optimization. 
arXiv:1412.6980","author":"Kingma P","year":"2014","unstructured":"Diederik\u00a0P Kingma. 2014. Adam: A Method for Stochastic Optimization. arXiv:1412.6980 (2014)."},{"key":"e_1_3_2_1_112_1","doi-asserted-by":"crossref","unstructured":"Youngeun Kwon Yunjae Lee and Minsoo Rhu. 2019. TensorDIMM: A Practical Near-Memory Processing Architecture for Embeddings and Tensor Operations in Deep Learning. In MICRO.","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_2_1_113_1","unstructured":"Young-Cheon Kwon Suk\u00a0Han Lee Jaehoon Lee Sang-Hyuk Kwon Je\u00a0Min Ryu Jong-Pil Son O Seongil Hak-Soo Yu Haesuk Lee Soo\u00a0Young Kim 2021. 25.4 A 20nm 6GB Function-In-Memory DRAM Based on HBM2 with a 1.2TFLOPS Programmable Computing Unit Using Bank-Level Parallelism for Machine Learning Applications. In ISSCC."},{"key":"e_1_3_2_1_114_1","volume-title":"Communication-Efficient Algorithms for Decentralized and Stochastic Optimization. Mathematical Programming","author":"Lan Guanghui","year":"2020","unstructured":"Guanghui Lan, Soomin Lee, and Yi Zhou. 2020. Communication-Efficient Algorithms for Decentralized and Stochastic Optimization. Mathematical Programming (2020)."},{"key":"e_1_3_2_1_115_1","doi-asserted-by":"crossref","unstructured":"Dominique Lavenier Remy Cimadomo and Romaric Jodin. 2020. Variant Calling Parallelization on Processor-in-Memory Architecture. In BIBM.","DOI":"10.1101\/2020.11.03.366237"},{"key":"e_1_3_2_1_116_1","unstructured":"Dominique Lavenier Charles Deltel David Furodet and Jean-Fran\u00e7ois Roy. 2016. BLAST on UPMEM. Ph.\u00a0D. Dissertation. INRIA Rennes-Bretagne Atlantique."},{"key":"e_1_3_2_1_117_1","unstructured":"Donghun Lee Jinin So Minseon Ahn Jong-Geon Lee Jungmin Kim Jeonghyeon Cho Rebholz Oliver Vishnu\u00a0Charan Thummala Ravi\u00a0shankar JV Sachin\u00a0Suresh Upadhya 2022. Improving In-Memory Database Operations with Acceleration DIMM (AxDIMM). 
In DaMoN."},{"key":"e_1_3_2_1_118_1","unstructured":"Joo\u00a0Hwan Lee Jaewoong Sim and Hyesoon Kim. 2015. BSSync: Processing Near Memory for Machine Learning Workloads with Bounded Staleness Consistency Models. In PACT."},{"key":"e_1_3_2_1_119_1","unstructured":"Sukhan Lee Shin-haeng Kang Jaehoon Lee Hyeonsu Kim Eojin Lee Seungwoo Seo Hosang Yoon Seungwon Lee Kyounghwan Lim Hyunsung Shin 2021. Hardware Architecture and Software Stack for PIM Based on Commercial DRAM Technology. In ISCA."},{"key":"e_1_3_2_1_120_1","unstructured":"Seongju Lee Kyuyoung Kim Sanghoon Oh Joonhong Park Gimoon Hong Dongyoon Ka Kyudong Hwang Jeongje Park Kyeongpil Kang Jungyeon Kim 2022. A 1ynm 1.25V 8Gb 16Gb\/s\/pin GDDR6-based Accelerator-in-Memory supporting 1TFLOPS MAC Operation and Various Activation Functions for Deep-Learning Applications. In ISSCC."},{"key":"e_1_3_2_1_121_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3077294"},{"key":"e_1_3_2_1_122_1","doi-asserted-by":"publisher","DOI":"10.1145\/3375699"},{"key":"e_1_3_2_1_123_1","doi-asserted-by":"crossref","unstructured":"Mu Li David\u00a0G Andersen Alexander\u00a0J Smola and Kai Yu. 2014. Communication Efficient Distributed Machine Learning with the Parameter Server. NeurIPS.","DOI":"10.1145\/2640087.2644155"},{"key":"e_1_3_2_1_124_1","volume":"201","author":"Li Mu","unstructured":"Mu Li, Tong Zhang, Yuqiang Chen, and Alexander\u00a0J Smola. 2014. Efficient Mini-Batch Training for Stochastic Optimization. In SIGKDD.","journal-title":"J Smola."},{"key":"e_1_3_2_1_125_1","unstructured":"Xiangru Lian Ce Zhang Huan Zhang Cho-Jui Hsieh Wei Zhang and Ji Liu. 2017. Can Decentralized Algorithms Outperform Centralized Algorithms? A Case Study for Decentralized Parallel Stochastic Gradient Descent. In NeurIPS."},{"key":"e_1_3_2_1_126_1","volume-title":"Design and Analysis of a Processing-in-DIMM Join Algorithm: A Case Study with UPMEM DIMMs. 
PACMMOD","author":"Lim Chaemin","year":"2023","unstructured":"Chaemin Lim, Suhyun Lee, Jinwoo Choi, Jounghoo Lee, Seongyeon Park, Hanjun Kim, Jinho Lee, and Youngsok Kim. 2023. Design and Analysis of a Processing-in-DIMM Join Algorithm: A Case Study with UPMEM DIMMs. PACMMOD (2023)."},{"key":"e_1_3_2_1_127_1","doi-asserted-by":"publisher","DOI":"10.1561\/9781680837018"},{"key":"e_1_3_2_1_128_1","unstructured":"Jiawen Liu Hengyu Zhao Matheus\u00a0A Ogleari Dong Li and Jishen Zhao. 2018. Processing-In-Memory for Energy-Efficient Neural Network Training: A Heterogeneous Approach. In MICRO."},{"key":"e_1_3_2_1_129_1","doi-asserted-by":"crossref","unstructured":"Zhiyu Liu Irina Calciu Maurice Herlihy and Onur Mutlu. 2017. Concurrent Data Structures for Near-Memory Computing. In SPAA.","DOI":"10.1145\/3087556.3087582"},{"key":"e_1_3_2_1_130_1","unstructured":"Yandong Luo and Shimeng Yu. 2020. Benchmark Non-Volatile and Volatile Memory Based Hybrid Precision Synapses for In-Situ Deep Neural Network Training. In ASP-DAC."},{"key":"e_1_3_2_1_131_1","volume-title":"TABLA: A Unified Template-based Framework for Accelerating Statistical Machine Learning. In HPCA.","author":"Mahajan Divya","year":"2016","unstructured":"Divya Mahajan, Jongse Park, Emmanuel Amaro, Hardik Sharma, Amir Yazdanbakhsh, Joon\u00a0Kyung Kim, and Hadi Esmaeilzadeh. 2016. TABLA: A Unified Template-based Framework for Accelerating Statistical Machine Learning. In HPCA."},{"key":"e_1_3_2_1_132_1","doi-asserted-by":"crossref","unstructured":"Nika Mansouri\u00a0Ghiasi Jisung Park Harun Mustafa Jeremie Kim Ataberk Olgun Arvid Gollwitzer Damla Senol\u00a0Cali Can Firtina Haiyu Mao Nour Almadhoun\u00a0Alserr 2022. GenStore: A High-Performance In-Storage Processing System for Genome Sequence Analysis. 
In ASPLOS.","DOI":"10.1145\/3503222.3507702"},{"key":"e_1_3_2_1_133_1","doi-asserted-by":"crossref","unstructured":"Haiyu Mao Mohammed Alser Mohammad Sadrosadati Can Firtina Akanksha Baranwal Damla\u00a0Senol Cali Aditya Manglik Nour\u00a0Almadhoun Alserr and Onur Mutlu. 2022. GenPIP: In-Memory Acceleration of Genome Analysis via Tight Integration of Basecalling and Read Mapping. In MICRO.","DOI":"10.1109\/MICRO56248.2022.00056"},{"key":"e_1_3_2_1_134_1","unstructured":"Ryan McDonald Keith Hall and Gideon Mann. 2010. Distributed Training Strategies for the Structured Perceptron. In NAACL HLT."},{"key":"e_1_3_2_1_135_1","volume-title":"Intelligent Architectures for Intelligent Computing Systems. DATE","author":"Mutlu Onur","year":"2021","unstructured":"Onur Mutlu. 2021. Intelligent Architectures for Intelligent Computing Systems. DATE (2021)."},{"key":"e_1_3_2_1_136_1","unstructured":"Onur Mutlu. 2023. Memory-Centric Computing. In DAC."},{"key":"e_1_3_2_1_137_1","doi-asserted-by":"crossref","unstructured":"Onur Mutlu Saugata Ghose Juan G\u00f3mez-Luna and Rachata Ausavarungnirun. 2019. Enabling Practical Processing in and near Memory for Data-Intensive Computing. In DAC.","DOI":"10.1145\/3316781.3323476"},{"key":"e_1_3_2_1_138_1","volume-title":"Processing Data Where It Makes Sense: Enabling In-Memory Computation. Microprocessors and Microsystems","author":"Mutlu Onur","year":"2019","unstructured":"Onur Mutlu, Saugata Ghose, Juan G\u00f3mez-Luna, and Rachata Ausavarungnirun. 2019. Processing Data Where It Makes Sense: Enabling In-Memory Computation. Microprocessors and Microsystems (2019)."},{"key":"e_1_3_2_1_139_1","volume-title":"Emerging Computing: From Devices to Systems: Looking Beyond Moore and Von Neumann","author":"Mutlu Onur","unstructured":"Onur Mutlu, Saugata Ghose, Juan G\u00f3mez-Luna, and Rachata Ausavarungnirun. 2022. A Modern Primer on Processing in Memory. In Emerging Computing: From Devices to Systems: Looking Beyond Moore and Von Neumann. 
Springer."},{"key":"e_1_3_2_1_140_1","unstructured":"Lifeng Nai Ramyad Hadidi Jaewoong Sim Hyojong Kim Pranith Kumar and Hyesoon Kim. 2017. GraphPIM: Enabling Instruction-Level PIM Offloading in Graph Computing Frameworks. In HPCA."},{"key":"e_1_3_2_1_141_1","unstructured":"Joel Nider Craig Mustard Andrada Zoltan John Ramsden Larry Liu Jacob Grossbard Mohammad Dashti Romaric Jodin Alexandre Ghiti Jordi Chauzi 2021. A Case Study of Processing-in-Memory in off-the-Shelf Systems. In USENIX."},{"key":"e_1_3_2_1_142_1","doi-asserted-by":"crossref","unstructured":"Dimin Niu Shuangchen Li Yuhao Wang Wei Han Zhe Zhang Yijin Guan Tianchan Guan Fei Sun Fei Xue Lide Duan 2022. 184QPS\/W 64Mb\/mm2 3D Logic-to-DRAM Hybrid Bonding with Process-Near-Memory Engine for Recommendation System. In ISSCC.","DOI":"10.1109\/ISSCC42614.2022.9731694"},{"key":"e_1_3_2_1_143_1","unstructured":"NVIDIA. 2020. NVIDIA A100 Tensor Core GPU Architecture. White Paper. https:\/\/images.nvidia.com\/aem-dam\/en-zz\/Solutions\/data-center\/nvidia-ampere-architecture-whitepaper.pdf."},{"key":"e_1_3_2_1_144_1","volume-title":"PiDRAM: A Holistic End-to-end FPGA-based Framework for Processing-in-DRAM. TACO","author":"Olgun Ataberk","year":"2022","unstructured":"Ataberk Olgun, Juan\u00a0G\u00f3mez Luna, Konstantinos Kanellopoulos, Behzad Salami, Hasan Hassan, Oguz Ergin, and Onur Mutlu. 2022. PiDRAM: A Holistic End-to-end FPGA-based Framework for Processing-in-DRAM. TACO (2022)."},{"key":"e_1_3_2_1_145_1","doi-asserted-by":"crossref","unstructured":"Geraldo\u00a0F Oliveira Amirali Boroumand Saugata Ghose Juan G\u00f3mez-Luna and Onur Mutlu. 2022. Heterogeneous Data-Centric Architectures for Modern Data-Intensive Applications: Case Studies in Machine Learning and Databases. 
In ISVLSI.","DOI":"10.1109\/ISVLSI54635.2022.00060"},{"key":"e_1_3_2_1_146_1","volume-title":"DAMOV: A New Methodology and Benchmark Suite for Evaluating Data Movement Bottlenecks","author":"Oliveira F","year":"2021","unstructured":"Geraldo\u00a0F Oliveira, Juan G\u00f3mez-Luna, Lois Orosa, Saugata Ghose, Nandita Vijaykumar, Ivan Fernandez, Mohammad Sadrosadati, and Onur Mutlu. 2021. DAMOV: A New Methodology and Benchmark Suite for Evaluating Data Movement Bottlenecks. IEEE Access (2021)."},{"key":"e_1_3_2_1_147_1","volume-title":"DaPPA: A Data-Parallel Framework for Processing-In-Memory Architectures. arXiv:2310.10168","author":"Oliveira F","year":"2023","unstructured":"Geraldo\u00a0F Oliveira, Alain Kohli, David Novo, Juan G\u00f3mez-Luna, and Onur Mutlu. 2023. DaPPA: A Data-Parallel Framework for Processing-In-Memory Architectures. arXiv:2310.10168 (2023)."},{"key":"e_1_3_2_1_148_1","volume-title":"MIMDRAM: An End-to-End Processing-Using-DRAM System for High-Throughput, Energy-Efficient and Programmer-Transparent Multiple-Instruction Multiple-Data Computing. In HPCA.","author":"Oliveira F","year":"2024","unstructured":"Geraldo\u00a0F Oliveira, Ataberk Olgun, Abdullah\u00a0Giray Ya\u011fl\u0131k\u00e7\u0131, F\u00a0Nisa Bostanc\u0131, Juan G\u00f3mez-Luna, Saugata Ghose, and Onur Mutlu. 2024. MIMDRAM: An End-to-End Processing-Using-DRAM System for High-Throughput, Energy-Efficient and Programmer-Transparent Multiple-Instruction Multiple-Data Computing. In HPCA."},{"key":"e_1_3_2_1_149_1","doi-asserted-by":"crossref","unstructured":"Jisung Park Roknoddin Azizi Geraldo\u00a0F Oliveira Mohammad Sadrosadati Rakesh Nadig David Novo Juan G\u00f3mez-Luna Myungsuk Kim and Onur Mutlu. 2022. Flash-Cosmos: In-Flash Bulk Bitwise Operations Using Inherent Computation Capability of NAND Flash Memory. 
In MICRO.","DOI":"10.1109\/MICRO56248.2022.00069"},{"key":"e_1_3_2_1_150_1","doi-asserted-by":"crossref","unstructured":"Jaehyun Park Byeongho Kim Sungmin Yun Eojin Lee Minsoo Rhu and Jung\u00a0Ho Ahn. 2021. TRiM: Enhancing Processor-Memory Interfaces with Scalable Tensor Reduction in Memory. In MICRO.","DOI":"10.1145\/3466752.3480080"},{"key":"e_1_3_2_1_151_1","volume-title":"High-throughput Near-Memory Processing on CNNs with 3D HBM-like Memory. TODAES","author":"Park Naebeom","year":"2021","unstructured":"Naebeom Park, Sungju Ryu, Jaeha Kung, and Jae-Joon Kim. 2021. High-throughput Near-Memory Processing on CNNs with 3D HBM-like Memory. TODAES (2021)."},{"key":"e_1_3_2_1_152_1","volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In NeurIPS."},{"key":"e_1_3_2_1_153_1","doi-asserted-by":"crossref","unstructured":"Ashutosh Pattnaik Xulong Tang Adwait Jog Onur Kayiran Asit\u00a0K Mishra Mahmut\u00a0T Kandemir Onur Mutlu and Chita\u00a0R Das. 2016. Scheduling Techniques for GPU Architectures with Processing-In-Memory Capabilities. In PACT.","DOI":"10.1145\/2967938.2967940"},{"key":"e_1_3_2_1_154_1","volume-title":"Trends, Challenges, and Perspectives. Journal of Systems Architecture","author":"Peccerillo Biagio","year":"2022","unstructured":"Biagio Peccerillo, Mirco Mannino, Andrea Mondelli, and Sandro Bartolini. 2022. A Survey on Hardware Accelerators: Taxonomy, Trends, Challenges, and Perspectives. Journal of Systems Architecture (2022)."},{"key":"e_1_3_2_1_155_1","unstructured":"Boris\u00a0T Polyak. 1987. Introduction to Optimization. Optimization Software."},{"key":"e_1_3_2_1_156_1","volume-title":"NDC: Analyzing the Impact of 3D-Stacked Memory+Logic Devices on MapReduce Workloads. 
In ISPASS.","author":"Pugsley H","year":"2014","unstructured":"Seth\u00a0H Pugsley, Jeffrey Jestes, Huihui Zhang, Rajeev Balasubramonian, Vijayalakshmi Srinivasan, Alper Buyuktosunoglu, Al Davis, and Feifei Li. 2014. NDC: Analyzing the Impact of 3D-Stacked Memory+Logic Devices on MapReduce Workloads. In ISPASS."},{"key":"e_1_3_2_1_157_1","volume-title":"A Stochastic Approximation Method. The Annals of Mathematical Statistics","author":"Robbins Herbert","year":"1951","unstructured":"Herbert Robbins and Sutton Monro. 1951. A Stochastic Approximation Method. The Annals of Mathematical Statistics (1951)."},{"key":"e_1_3_2_1_158_1","volume-title":"An Overview of Gradient Descent Optimization Algorithms. arXiv:1609.04747","author":"Ruder Sebastian","year":"2016","unstructured":"Sebastian Ruder. 2016. An Overview of Gradient Descent Optimization Algorithms. arXiv:1609.04747 (2016)."},{"key":"e_1_3_2_1_159_1","unstructured":"SAFARI Research Group. 2024. PIM-Opt Artifact \u2014 GitHub Repository. https:\/\/github.com\/CMU-SAFARI\/PIM-Opt."},{"key":"e_1_3_2_1_160_1","doi-asserted-by":"publisher","unstructured":"SAFARI Research Group. 2024. PIM-Opt Artifact \u2014 Zenodo Repository. https:\/\/doi.org\/10.5281\/zenodo.12747665.","DOI":"10.5281\/zenodo.12747665"},{"key":"e_1_3_2_1_161_1","doi-asserted-by":"crossref","unstructured":"Jyotishman Saikia Shihui Yin Zhewei Jiang Mingoo Seok and Jae-sun Seo. 2019. K-Nearest Neighbor Hardware Accelerator Using In-Memory Computing SRAM. In ISLPED.","DOI":"10.1109\/ISLPED.2019.8824822"},{"key":"e_1_3_2_1_162_1","volume-title":"A Scalable Near-Memory Architecture for Training Deep Neural Networks on Large In-Memory Datasets","author":"Schuiki Fabian","year":"2018","unstructured":"Fabian Schuiki, Michael Schaffner, Frank\u00a0K G\u00fcrkaynak, and Luca Benini. 2018. A Scalable Near-Memory Architecture for Training Deep Neural Networks on Large In-Memory Datasets. 
IEEE Transactions on Computers (2018)."},{"key":"e_1_3_2_1_163_1","volume-title":"Simple DRAM and Virtual Memory Abstractions to Enable Highly Efficient Memory Systems. arXiv:1605.06483","author":"Seshadri Vivek","year":"2016","unstructured":"Vivek Seshadri. 2016. Simple DRAM and Virtual Memory Abstractions to Enable Highly Efficient Memory Systems. arXiv:1605.06483 (2016)."},{"key":"e_1_3_2_1_164_1","volume-title":"Fast Bulk Bitwise AND and OR in DRAM. ICAL","author":"Seshadri Vivek","year":"2015","unstructured":"Vivek Seshadri, Kevin Hsieh, Amirali Boroum, Donghyuk Lee, Michael\u00a0A Kozuch, Onur Mutlu, Phillip\u00a0B Gibbons, and Todd\u00a0C Mowry. 2015. Fast Bulk Bitwise AND and OR in DRAM. ICAL (2015)."},{"key":"e_1_3_2_1_165_1","doi-asserted-by":"crossref","unstructured":"Vivek Seshadri Yoongu Kim Chris Fallin Donghyuk Lee Rachata Ausavarungnirun Gennady Pekhimenko Yixin Luo Onur Mutlu Phillip\u00a0B Gibbons Michael\u00a0A Kozuch 2013. RowClone: Fast and Energy-Efficient In-DRAM Bulk Data Copy and Initialization. In MICRO.","DOI":"10.1145\/2540708.2540725"},{"key":"e_1_3_2_1_166_1","volume-title":"Buddy-RAM: Improving the Performance and Efficiency of Bulk Bitwise Operations Using DRAM. arXiv:1611.09988","author":"Seshadri Vivek","year":"2016","unstructured":"Vivek Seshadri, Donghyuk Lee, Thomas Mullins, Hasan Hassan, Amirali Boroumand, Jeremie Kim, Michael\u00a0A Kozuch, Onur Mutlu, Phillip\u00a0B Gibbons, and Todd\u00a0C Mowry. 2016. Buddy-RAM: Improving the Performance and Efficiency of Bulk Bitwise Operations Using DRAM. arXiv:1611.09988 (2016)."},{"key":"e_1_3_2_1_167_1","doi-asserted-by":"publisher","DOI":"10.1145\/3123939.3124544"},{"key":"e_1_3_2_1_168_1","volume-title":"Advances in Computers","author":"Seshadri Vivek","unstructured":"Vivek Seshadri and Onur Mutlu. 2017. Simple Operations in Memory to Reduce Data Movement. In Advances in Computers. Elsevier."},{"key":"e_1_3_2_1_169_1","volume-title":"In-DRAM Bulk Bitwise Execution Engine. 
arXiv:1905.09822","author":"Seshadri Vivek","year":"2019","unstructured":"Vivek Seshadri and Onur Mutlu. 2019. In-DRAM Bulk Bitwise Execution Engine. arXiv:1905.09822 (2019)."},{"key":"e_1_3_2_1_170_1","volume-title":"Swordfish: A Framework for Evaluating Deep Neural Network-based Basecalling using Computation-In-Memory with Non-Ideal Memristors. In MICRO.","author":"Shahroodi Taha","year":"2023","unstructured":"Taha Shahroodi, Gagandeep Singh, Mahdi Zahedi, Haiyu Mao, Joel Lindegger, Can Firtina, Stephan Wong, Onur Mutlu, and Said Hamdioui. 2023. Swordfish: A Framework for Evaluating Deep Neural Network-based Basecalling using Computation-In-Memory with Non-Ideal Memristors. In MICRO."},{"key":"e_1_3_2_1_171_1","doi-asserted-by":"crossref","unstructured":"Charles\u00a0F Shelor and Krishna\u00a0M Kavi. 2019. Reconfigurable Dataflow Graphs for Processing-In-Memory. In ICDCN.","DOI":"10.1145\/3288599.3288605"},{"key":"e_1_3_2_1_172_1","volume-title":"McDRAM: Low Latency and Energy-Efficient Matrix Computations in DRAM. TCAD","author":"Shin Hyunsung","year":"2018","unstructured":"Hyunsung Shin, Dongyoung Kim, Eunhyeok Park, Sungho Park, Yongsik Park, and Sungjoo Yoo. 2018. McDRAM: Low Latency and Energy-Efficient Matrix Computations in DRAM. TCAD (2018)."},{"key":"e_1_3_2_1_173_1","volume-title":"NERO: A Near High-Bandwidth Memory Stencil Accelerator for Weather Prediction Modeling. In FPL.","author":"Singh Gagandeep","year":"2020","unstructured":"Gagandeep Singh, Dionysios Diamantopoulos, Christoph Hagleitner, Juan Gomez-Luna, Sander Stuijk, Onur Mutlu, and Henk Corporaal. 2020. NERO: A Near High-Bandwidth Memory Stencil Accelerator for Weather Prediction Modeling. In FPL."},{"key":"e_1_3_2_1_174_1","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317867"},{"key":"e_1_3_2_1_175_1","doi-asserted-by":"crossref","unstructured":"Benjamin Sirb and Xiaojing Ye. 2016. Consensus Optimization with Delayed and Stochastic Gradients on Decentralized Networks. 
In Big Data.","DOI":"10.1109\/BigData.2016.7840591"},{"key":"e_1_3_2_1_176_1","unstructured":"Sebastian\u00a0U Stich Jean-Baptiste Cordonnier and Martin Jaggi. 2018. Sparsified SGD with Memory. In NeurIPS."},{"key":"e_1_3_2_1_177_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.1970.5008902"},{"key":"e_1_3_2_1_178_1","doi-asserted-by":"publisher","DOI":"10.21437\/Interspeech.2015-354"},{"key":"e_1_3_2_1_179_1","doi-asserted-by":"crossref","unstructured":"Hanbo Sun Zhenhua Zhu Yi Cai Xiaoming Chen Yu Wang and Huazhong Yang. 2020. An Energy-Efficient Quantized and Regularized Training Framework for Processing-In-Memory Accelerators. In ASP-DAC.","DOI":"10.1109\/ASP-DAC47756.2020.9045192"},{"key":"e_1_3_2_1_180_1","doi-asserted-by":"crossref","unstructured":"Weiyi Sun Zhaoshi Li Shouyi Yin Shaojun Wei and Leibo Liu. 2021. ABC-DIMM: Alleviating the Bottleneck of Communication in DIMM-based Near-Memory Processing with Inter-DIMM Broadcast. In ISCA.","DOI":"10.1109\/ISCA52012.2021.00027"},{"key":"e_1_3_2_1_181_1","volume-title":"One-Step Regression and Classification with Cross-Point Resistive Memory Arrays. Science Advances","author":"Sun Zhong","year":"2020","unstructured":"Zhong Sun, Giacomo Pedretti, Alessandro Bricalli, and Daniele Ielmini. 2020. One-Step Regression and Classification with Cross-Point Resistive Memory Arrays. Science Advances (2020)."},{"key":"e_1_3_2_1_182_1","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"e_1_3_2_1_183_1","doi-asserted-by":"publisher","DOI":"10.1145\/3020078.3021744"},{"key":"e_1_3_2_1_184_1","unstructured":"UPMEM. 2022. Product Sheet UPMEM."},{"key":"e_1_3_2_1_185_1","unstructured":"UPMEM. 2022. UPMEM Processing In-Memory (PIM). UPMEM PIM Tech Paper."},{"key":"e_1_3_2_1_186_1","volume-title":"UPMEM PIM Platform for Data-Intensive Applications. In ABUMPIMP. Symposium as part of Euro-Par.","author":"UPMEM.","year":"2023","unstructured":"UPMEM. 2023. UPMEM PIM Platform for Data-Intensive Applications. In ABUMPIMP. 
Symposium as part of Euro-Par."},{"key":"e_1_3_2_1_187_1","unstructured":"UPMEM. 2023. UPMEM SDK Version 2023.2.0. https:\/\/sdk.upmem.com\/2023.2.0\/."},{"key":"e_1_3_2_1_188_1","unstructured":"UPMEM. 2024. UPMEM Website. https:\/\/www.upmem.com."},{"key":"e_1_3_2_1_189_1","doi-asserted-by":"crossref","unstructured":"Joao Vieira Nuno Roma Pedro Tom\u00e1s Paolo Ienne and Gabriel Falcao. 2018. Exploiting Compute Caches for Memory Bound Vector Operations. In SBAC-PAD.","DOI":"10.1109\/CAHPC.2018.8645905"},{"key":"e_1_3_2_1_190_1","volume-title":"Will we run out of Data? An Analysis of the Limits of scaling datasets in Machine Learning. arXiv:2211.04325","author":"Villalobos Pablo","year":"2022","unstructured":"Pablo Villalobos, Jaime Sevilla, Lennart Heim, Tamay Besiroglu, Marius Hobbhahn, and Anson Ho. 2022. Will we run out of Data? An Analysis of the Limits of scaling datasets in Machine Learning. arXiv:2211.04325 (2022)."},{"key":"e_1_3_2_1_191_1","unstructured":"Jue Wang Yucheng Lu Binhang Yuan Beidi Chen Percy Liang Christopher De\u00a0Sa Christopher Re and Ce Zhang. 2023. CocktailSGD: Fine-Tuning Foundation Models over 500Mbps Networks. In ICML."},{"key":"e_1_3_2_1_192_1","unstructured":"Jialei Wang Weiran Wang and Nathan Srebro. 2017. Memory and Communication Efficient Distributed Stochastic Optimization with Minibatch Prox. In COLT."},{"key":"e_1_3_2_1_193_1","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2020.3015777"},{"key":"e_1_3_2_1_194_1","doi-asserted-by":"crossref","unstructured":"Zeke Wang Kaan Kara Hantian Zhang Gustavo Alonso Onur Mutlu and Ce Zhang. 2019. Accelerating Generalized Linear Models with MLWeaving: A One-Size-Fits-All System for Any-Precision Learning. In VLDB.","DOI":"10.14778\/3317315.3317322"},{"key":"e_1_3_2_1_195_1","volume-title":"Lo-fi: Distributed Fine-tuning Without Communication. 
arXiv:2210.11948","author":"Wortsman Mitchell","year":"2022","unstructured":"Mitchell Wortsman, Suchin Gururangan, Shen Li, Ali Farhadi, Ludwig Schmidt, Michael Rabbat, and Ari\u00a0S Morcos. 2022. Lo-fi: Distributed Fine-tuning Without Communication. arXiv:2210.11948 (2022)."},{"key":"e_1_3_2_1_196_1","volume-title":"PIM-GPT: A Hybrid Process-in-Memory Accelerator for Autoregressive Transformers. arXiv:2310.09385","author":"Wu Yuting","year":"2023","unstructured":"Yuting Wu, Ziyu Wang, and Wei\u00a0D Lu. 2023. PIM-GPT: A Hybrid Process-in-Memory Accelerator for Autoregressive Transformers. arXiv:2310.09385 (2023)."},{"key":"e_1_3_2_1_197_1","unstructured":"Haocheng Xi Changhao Li Jianfei Chen and Jun Zhu. 2023. Training Transformers with 4-bit Integers. In NeurIPS."},{"key":"e_1_3_2_1_198_1","unstructured":"Xiaolong Xie Wei Tan Liana\u00a0L Fong and Yun Liang. 2017. CuMF_SGD: Parallelized Stochastic Gradient Descent for Matrix Factorization on GPUs. In HPDC."},{"key":"e_1_3_2_1_199_1","unstructured":"Hang Xu Chen-Yu Ho Ahmed\u00a0M Abdelmoniem Aritra Dutta El\u00a0Houcine Bergou Konstantinos Karatsenidis Marco Canini and Panos Kalnis. 2020. Compressed Communication for Distributed Deep Learning: Survey and Quantitative Evaluation. Technical Report."},{"key":"e_1_3_2_1_200_1","unstructured":"Fuzhao Xue Yao Fu Wangchunshu Zhou Zangwei Zheng and Yang You. 2024. To Repeat or Not To Repeat: Insights from Scaling LLM under Token-Crisis. In NeurIPS."},{"key":"e_1_3_2_1_201_1","doi-asserted-by":"crossref","unstructured":"Hao Yu Sen Yang and Shenghuo Zhu. 2019. Parallel Restarted SGD with Faster Convergence and Less Communication: Demystifying Why Model Averaging Works for Deep Learning. 
In AAAI.","DOI":"10.1609\/aaai.v33i01.33015693"},{"key":"e_1_3_2_1_202_1","doi-asserted-by":"crossref","unstructured":"\u0130smail\u00a0Emir Y\u00fcksel Yahya\u00a0Can Tu\u011frul Ataberk Olgun F\u00a0Nisa Bostanc\u0131 A\u00a0Giray Ya\u011fl\u0131k\u00e7\u0131 Geraldo\u00a0F Oliveira Haocong Luo Juan G\u00f3mez-Luna Mohammad Sadrosadati and Onur Mutlu. 2024. Functionally-Complete Boolean Logic in Real DRAM Chips: Experimental Characterization and Analysis. In HPCA.","DOI":"10.1109\/HPCA57654.2024.00030"},{"key":"e_1_3_2_1_203_1","volume-title":"Offloading Embedding Lookups to Processing-In-Memory for Deep Learning Recommender Models. Master\u2019s thesis","author":"Zarif Niloofar","unstructured":"Niloofar Zarif. 2023. Offloading Embedding Lookups to Processing-In-Memory for Deep Learning Recommender Models. Master\u2019s thesis. University of British Columbia."},{"key":"e_1_3_2_1_204_1","volume-title":"DimmWitted: A Study of Main-Memory Statistical Analytics. arXiv:1403.7550","author":"Zhang Ce","year":"2014","unstructured":"Ce Zhang and Christopher R\u00e9. 2014. DimmWitted: A Study of Main-Memory Statistical Analytics. arXiv:1403.7550 (2014)."},{"key":"e_1_3_2_1_205_1","doi-asserted-by":"crossref","unstructured":"Dongping Zhang Nuwan Jayasena Alexander Lyashevsky Joseph\u00a0L Greathouse Lifan Xu and Michael Ignatowski. 2014. TOP-PIM: Throughput-Oriented Programmable Processing in Memory. In HPDC.","DOI":"10.1145\/2600212.2600213"},{"key":"e_1_3_2_1_206_1","volume-title":"Parallel SGD: When does averaging help?arXiv:1606.07365","author":"Zhang Jian","year":"2016","unstructured":"Jian Zhang, Christopher De\u00a0Sa, Ioannis Mitliagkas, and Christopher R\u00e9. 2016. Parallel SGD: When does averaging help?arXiv:1606.07365 (2016)."},{"key":"e_1_3_2_1_207_1","doi-asserted-by":"crossref","unstructured":"Zhipeng Zhang Jiawei Jiang Wentao Wu Ce Zhang Lele Yu and Bin Cui. 2019. MLlib*: Fast Training of GLMs Using Spark MLlib. 
In ICDE.","DOI":"10.1109\/ICDE.2019.00194"},{"key":"e_1_3_2_1_208_1","volume-title":"On the Convergence Properties of a K-step Averaging Stochastic Gradient Descent Algorithm for Nonconvex Optimization. arXiv:1708.01012","author":"Zhou Fan","year":"2017","unstructured":"Fan Zhou and Guojing Cong. 2017. On the Convergence Properties of a K-step Averaging Stochastic Gradient Descent Algorithm for Nonconvex Optimization. arXiv:1708.01012 (2017)."},{"key":"e_1_3_2_1_209_1","volume-title":"Steven Chu\u00a0Hong Hoi","author":"Zhou Pan","year":"2020","unstructured":"Pan Zhou, Jiashi Feng, Chao Ma, Caiming Xiong, Steven Chu\u00a0Hong Hoi, 2020. Towards Theoretically Understanding Why SGD Generalizes Better Than ADAM in Deep Learning. In NeurIPS."},{"key":"e_1_3_2_1_210_1","unstructured":"Qiuling Zhu Tobias Graf H\u00a0Ekin Sumbul Larry Pileggi and Franz Franchetti. 2013. Accelerating Sparse Matrix-Matrix Multiplication with 3D-Stacked Logic-in-Memory Hardware. In HPEC."},{"key":"e_1_3_2_1_211_1","doi-asserted-by":"crossref","unstructured":"Youwei Zhuo Chao Wang Mingxing Zhang Rui Wang Dimin Niu Yanzhi Wang and Xuehai Qian. 2019. GraphQ: Scalable PIM-based Graph Processing. In MICRO.","DOI":"10.1145\/3352460.3358256"},{"key":"e_1_3_2_1_212_1","unstructured":"Martin Zinkevich Markus Weimer Lihong Li and Alex Smola. 2010. Parallelized Stochastic Gradient Descent. 
In NeurIPS."}],"event":{"name":"PACT '24: International Conference on Parallel Architectures and Compilation Techniques","location":"Long Beach CA USA","acronym":"PACT '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"]},"container-title":["Proceedings of the 2024 International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3656019.3676947","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3656019.3676947","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:57:01Z","timestamp":1755892621000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3656019.3676947"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,13]]},"references-count":212,"alternative-id":["10.1145\/3656019.3676947","10.1145\/3656019"],"URL":"https:\/\/doi.org\/10.1145\/3656019.3676947","relation":{},"subject":[],"published":{"date-parts":[[2024,10,13]]},"assertion":[{"value":"2024-10-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}