{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,9]],"date-time":"2026-04-09T00:01:40Z","timestamp":1775692900693,"version":"3.50.1"},"publisher-location":"New York, NY, USA","reference-count":104,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"NSF China","award":["62032001"],"award-info":[{"award-number":["62032001"]}]},{"name":"111 Project","award":["B18001"],"award-info":[{"award-number":["B18001"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620665.3640376","type":"proceedings-article","created":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T14:18:06Z","timestamp":1713795486000},"page":"879-896","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":20,"title":["PIM-DL: Expanding the Applicability of Commodity DRAM-PIMs for Deep Learning via Algorithm-System Co-Optimization"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-7760-3254","authenticated-orcid":false,"given":"Cong","family":"Li","sequence":"first","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7929-8054","authenticated-orcid":false,"given":"Zhe","family":"Zhou","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, School of Computer Science, Peking University, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7322-4062","authenticated-orcid":false,"given":"Yang","family":"Wang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-9658-3062","authenticated-orcid":false,"given":"Fan","family":"Yang","sequence":"additional","affiliation":[{"name":"Nankai University, Tianjin, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9107-013X","authenticated-orcid":false,"given":"Ting","family":"Cao","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-6455-3898","authenticated-orcid":false,"given":"Mao","family":"Yang","sequence":"additional","affiliation":[{"name":"Microsoft Research, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-9076-7998","authenticated-orcid":false,"given":"Yun","family":"Liang","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7315-6589","authenticated-orcid":false,"given":"Guangyu","family":"Sun","sequence":"additional","affiliation":[{"name":"School of Integrated Circuits, Peking University, Beijing, China"},{"name":"Beijing Advanced Innovation Center for Integrated Circuits, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"et al. Tensorflow: Large-scale machine learning on heterogeneous distributed systems. 
arXiv preprint arXiv:1603.04467","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Ashish Agarwal, Paul Barham, Eugene Brevdo, Zhifeng Chen, Craig Citro, Greg S Corrado, Andy Davis, Jeffrey Dean, Matthieu Devin, et al. Tensorflow: Large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:1603.04467, 2016."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2750386"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00080"},{"key":"e_1_3_2_1_4_1","volume-title":"IEEE","author":"Asghari-Moghaddam Hadi","year":"2016","unstructured":"Hadi Asghari-Moghaddam, Young Hoon Son, Jung Ho Ahn, and Nam Sung Kim. Chameleon: Versatile and practical near-dram acceleration architecture for large memory systems. In 2016 49th annual IEEE\/ACM international symposium on Microarchitecture (MICRO), pages 1--13. IEEE, 2016."},{"key":"e_1_3_2_1_5_1","volume-title":"Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint arXiv:1607.06450","author":"Ba Jimmy Lei","year":"2016","unstructured":"Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint arXiv:1607.06450, 2016."},{"key":"e_1_3_2_1_6_1","first-page":"2023","article-title":"Accelerating large table scan using processing-in-memory technology","author":"Baumstark Alexander","year":"2023","unstructured":"Alexander Baumstark, Muhammad Attahir Jibril, and Kai-Uwe Sattler. Accelerating large table scan using processing-in-memory technology. BTW 2023, 2023.","journal-title":"BTW"},{"key":"e_1_3_2_1_7_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592980.3595323"},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.1145\/3592980.3595312"},{"key":"e_1_3_2_1_9_1","series-title":"Proceedings of Machine Learning Research","first-page":"992","volume-title":"Proceedings of the 38th International Conference on Machine Learning, ICML","author":"Blalock Davis W.","year":"2021","unstructured":"Davis W. Blalock and John V. Guttag. Multiplying matrices without multiplying. In Marina Meila and Tong Zhang, editors, Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event, volume 139 of Proceedings of Machine Learning Research, pages 992--1004. PMLR, 2021."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.1145\/3307650.3322266"},{"key":"e_1_3_2_1_11_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901, 2020."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_13_1","first-page":"3","volume-title":"Proceedings of the International Symposium on Memory Systems, MEMSYS 2017","author":"Cong Jason","year":"2017","unstructured":"Jason Cong, Zhenman Fang, Michael Gill, Farnoosh Javadi, and Glenn Reinman. AIM: accelerating computational genomics through scalable and noninvasive accelerator-interposed memory. In Proceedings of the International Symposium on Memory Systems, MEMSYS 2017, Alexandria, VA, USA, October 02 - 05, 2017, pages 3--14. 
ACM, 2017."},{"key":"e_1_3_2_1_14_1","volume-title":"Openmp: an industry standard api for shared-memory programming","author":"Dagum Leonardo","year":"1998","unstructured":"Leonardo Dagum and Ramesh Menon. Openmp: an industry standard api for shared-memory programming. IEEE computational science and engineering, 5(1):46--55, 1998."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2018.2821565"},{"key":"e_1_3_2_1_16_1","doi-asserted-by":"publisher","DOI":"10.1109\/SOCC56010.2022.9908126"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1145\/3316781.3317845"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875680"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/HOTCHIPS.2019.8875680"},{"key":"e_1_3_2_1_20_1","volume-title":"Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805","author":"Devlin Jacob","year":"2018","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. Bert: Pre-training of deep bidirectional transformers for language understanding. arXiv preprint arXiv:1810.04805, 2018."},{"key":"e_1_3_2_1_21_1","volume-title":"Onur Mutlu, and Izzat El Hajj. A framework for high-throughput sequence alignment using real processing-in-memory systems. Bioinformatics, 39(5):btad155","author":"Diab Safaa","year":"2023","unstructured":"Safaa Diab, Amir Nassereldine, Mohammed Alser, Juan G\u00f3mez Luna, Onur Mutlu, and Izzat El Hajj. A framework for high-throughput sequence alignment using real processing-in-memory systems. Bioinformatics, 39(5):btad155, 2023."},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Learning Representations","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. An image is worth 16\u00d716 words: Transformers for image recognition at scale. In International Conference on Learning Representations, 2021."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3437801.3441578"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2015.7056040"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00067"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2015.22"},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.1145\/3037697.3037702"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.1145\/3190508.3190541"},{"key":"e_1_3_2_1_30_1","unstructured":"Georgi Gerganov. Ggml tensor library for machine learning. https:\/\/github.com\/ggerganov\/ggml."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3547353.3522661"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00031"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"crossref","unstructured":"Juan G\u00f3mez-Luna Izzat El Hajj Ivan Fernandez Christina Giannoula Geraldo F Oliveira and Onur Mutlu. Benchmarking a new paradigm: An experimental analysis of a real processing-in-memory architecture. 
arXiv preprint arXiv:2105.03814 2021.","DOI":"10.1109\/ACCESS.2022.3174101"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISVLSI54635.2022.00064"},{"key":"e_1_3_2_1_35_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00071"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO.2018.00061"},{"key":"e_1_3_2_1_38_1","doi-asserted-by":"publisher","DOI":"10.1145\/3007787.3001159"},{"key":"e_1_3_2_1_39_1","unstructured":"Intel. Intel 64 and ia-32 architectures software developer manual volume 3b. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/intel-sdm.html."},{"key":"e_1_3_2_1_40_1","unstructured":"Intel. Intel advisor. https:\/\/www.intel.cn\/content\/www\/cn\/zh\/developer\/tools\/oneapi\/advisor.html#gs.2g5qlq."},{"key":"e_1_3_2_1_41_1","unstructured":"Intel. Intel oneapi math kernel library (mkl). https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/tools\/oneapi\/onemkl.html."},{"key":"e_1_3_2_1_42_1","unstructured":"Intel. oneapi deep neural network library. https:\/\/github.com\/oneapi-src\/oneDNN."},{"key":"e_1_3_2_1_43_1","unstructured":"Intel. Rapl power meter. https:\/\/www.intel.com\/content\/www\/us\/en\/developer\/articles\/technical\/software-security-guidance\/advisory-guidance\/running-average-power-limit-energy-reporting.html."},{"key":"e_1_3_2_1_44_1","volume-title":"Caffe: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093","author":"Jia Yangqing","year":"2014","unstructured":"Yangqing Jia, Evan Shelhamer, Jeff Donahue, Sergey Karayev, Jonathan Long, Ross Girshick, Sergio Guadarrama, and Trevor Darrell. Caffe: Convolutional architecture for fast feature embedding. arXiv preprint arXiv:1408.5093, 2014."},{"key":"e_1_3_2_1_45_1","first-page":"14745","article-title":"Two pure transformers can make one strong gan, and that can scale up","volume":"34","author":"Jiang Yifan","year":"2021","unstructured":"Yifan Jiang, Shiyu Chang, and Zhangyang Wang. Transgan: Two pure transformers can make one strong gan, and that can scale up. Advances in Neural Information Processing Systems, 34:14745--14758, 2021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00010"},{"key":"e_1_3_2_1_47_1","doi-asserted-by":"publisher","DOI":"10.1145\/3079856.3080246"},{"key":"e_1_3_2_1_48_1","volume-title":"Pim-tree: A skew-resistant index for processing-in-memory. arXiv preprint arXiv:2211.10516","author":"Kang Hongbo","year":"2022","unstructured":"Hongbo Kang, Yiwei Zhao, Guy E Blelloch, Laxman Dhulipala, Yan Gu, Charles McGuffey, and Phillip B Gibbons. Pim-tree: A skew-resistant index for processing-in-memory. arXiv preprint arXiv:2211.10516, 2022."},{"key":"e_1_3_2_1_49_1","doi-asserted-by":"publisher","DOI":"10.1145\/3558481.3591070"},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00070"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA45697.2020.00070"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1109\/TCAD.2018.2858358"},{"key":"e_1_3_2_1_53_1","volume-title":"Learning multiple layers of features from tiny images","author":"Krizhevsky Alex","year":"2009","unstructured":"Alex Krizhevsky, Geoffrey Hinton, et al. Learning multiple layers of features from tiny images. 
2009."},{"key":"e_1_3_2_1_54_1","doi-asserted-by":"publisher","DOI":"10.1109\/HCS55958.2022.9895629"},{"key":"e_1_3_2_1_55_1","first-page":"350","volume-title":"IEEE International Solid-State Circuits Conference, ISSCC 2021","author":"Kwon Young-Cheon","year":"2021","unstructured":"Young-Cheon Kwon, Suk Han Lee, Jaehoon Lee, Sang-Hyuk Kwon, Je-Min Ryu, Jong-Pil Son, Seongil O, Hak-soo Yu, Haesuk Lee, Soo Young Kim, Youngmin Cho, Jin Guk Kim, Jongyoon Choi, Hyunsung Shin, Jin Kim, BengSeng Phuah, HyoungMin Kim, Myeong Jun Song, Ahn Choi, Daeho Kim, Sooyoung Kim, Eun-Bong Kim, David Wang, Shinhaeng Kang, Yuhwan Ro, Seungwoo Seo, Joon-Ho Song, Jaeyoun Youn, Kyomin Sohn, and Nam Sung Kim. 25.4 A 20nm 6gb function-in-memory dram, based on HBM2 with a 1.2tflops programmable computing unit using bank-level parallelism, for machine learning applications. In IEEE International Solid-State Circuits Conference, ISSCC 2021, San Francisco, CA, USA, February 13-22, 2021, pages 350--352. IEEE, 2021."},{"key":"e_1_3_2_1_56_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358284"},{"key":"e_1_3_2_1_58_1","doi-asserted-by":"publisher","DOI":"10.1109\/BIBM49941.2020.9313351"},{"key":"e_1_3_2_1_59_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00013"},{"key":"e_1_3_2_1_60_1","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2021.3077294"},{"key":"e_1_3_2_1_61_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC56929.2023.10247764"},{"key":"e_1_3_2_1_62_1","doi-asserted-by":"publisher","DOI":"10.1145\/3589258"},{"key":"e_1_3_2_1_63_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480090"},{"key":"e_1_3_2_1_64_1","volume-title":"Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692, 2019."},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00986"},{"key":"e_1_3_2_1_66_1","volume-title":"Embedded binarized neural networks. CoRR, abs\/1709.02260","author":"McDanel Bradley","year":"2017","unstructured":"Bradley McDanel, Surat Teerapittayanon, and H. T. Kung. Embedded binarized neural networks. CoRR, abs\/1709.02260, 2017."},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2017.54"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.5555\/3104322.3104425"},{"key":"e_1_3_2_1_69_1","unstructured":"Nvidia. Cublas. https:\/\/developer.nvidia.com\/cublas."},{"key":"e_1_3_2_1_70_1","unstructured":"Nvidia. Cudnn. https:\/\/developer.nvidia.com\/cudnn."},{"key":"e_1_3_2_1_71_1","unstructured":"Samsung Advanced Institute of Technology. Pimsimulator. https:\/\/github.com\/SAITPublic\/PIMSimulator."},{"key":"e_1_3_2_1_72_1","first-page":"235","volume-title":"2023 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)","author":"Oliveira Geraldo F","year":"2023","unstructured":"Geraldo F Oliveira, Juan G\u00f3mez-Luna, Mohammad Sadrosadati, Yuxin Guo, and Onur Mutlu. Transpimlib: Efficient transcendental functions for processing-in-memory systems. In 2023 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS), pages 235--247. 
IEEE, 2023."},{"key":"e_1_3_2_1_73_1","unstructured":"Open Neural Network Exchange (ONNX). https:\/\/github.com\/onnx\/onnx."},{"key":"e_1_3_2_1_74_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISPASS.2019.00042"},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1145\/3466752.3480080"},{"key":"e_1_3_2_1_76_1","volume-title":"et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_77_1","doi-asserted-by":"publisher","DOI":"10.1109\/40.592312"},{"issue":"8","key":"e_1_3_2_1_78_1","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et al. Language models are unsupervised multitask learners. OpenAI blog, 1(8):9, 2019.","journal-title":"OpenAI blog"},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.5555\/3455716.3455856"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2018.2876312"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359658"},{"key":"e_1_3_2_1_82_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2021.3066909"},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/LCA.2020.3011643"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3570361.3613285"},{"key":"e_1_3_2_1_85_1","unstructured":"UPMEM. Upmem pim-dimm runtime library. https:\/\/sdk.upmem.com\/2023.1.0\/202_RTL.html."},{"key":"e_1_3_2_1_86_1","unstructured":"UPMEM. Upmem software development kit (sdk). https:\/\/sdk.upmem.com\/."},{"key":"e_1_3_2_1_87_1","volume-title":"Advances in Neural Information Processing Systems","author":"Vaswani Ashish","year":"2017","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. Von Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems, volume 30. Curran Associates, Inc., 2017."},{"key":"e_1_3_2_1_88_1","volume-title":"Glue: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461","author":"Wang Alex","year":"2018","unstructured":"Alex Wang, Amanpreet Singh, Julian Michael, Felix Hill, Omer Levy, and Samuel R Bowman. Glue: A multi-task benchmark and analysis platform for natural language understanding. arXiv preprint arXiv:1804.07461, 2018."},{"key":"e_1_3_2_1_89_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00061"},{"key":"e_1_3_2_1_90_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2018.2791440"},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00055"},{"key":"e_1_3_2_1_92_1","volume-title":"Xlnet: Generalized autoregressive pre-training for language understanding. 
Advances in neural information processing systems, 32","author":"Yang Zhilin","year":"2019","unstructured":"Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Russ R Salakhutdinov, and Quoc V Le. Xlnet: Generalized autoregressive pre-training for language understanding. Advances in neural information processing systems, 32, 2019."},{"key":"e_1_3_2_1_93_1","volume-title":"7th International Conference on Learning Representations, ICLR 2019","author":"Yin Penghang","year":"2019","unstructured":"Penghang Yin, Jiancheng Lyu, Shuai Zhang, Stanley J. Osher, Yingyong Qi, and Jack Xin. Understanding straight-through estimator in training activation quantized neural nets. In 7th International Conference on Learning Representations, ICLR 2019, New Orleans, LA, USA, May 6-9, 2019. OpenReview.net, 2019."},{"key":"e_1_3_2_1_94_1","doi-asserted-by":"publisher","DOI":"10.1109\/TPDS.2018.2858230"},{"key":"e_1_3_2_1_95_1","first-page":"558","volume-title":"Proceedings of the IEEE\/CVF international conference on computer vision","author":"Yuan Li","year":"2021","unstructured":"Li Yuan, Yunpeng Chen, Tao Wang, Weihao Yu, Yujun Shi, Zi-Hang Jiang, Francis EH Tay, Jiashi Feng, and Shuicheng Yan. Tokens-to-token vit: Training vision transformers from scratch on imagenet. In Proceedings of the IEEE\/CVF international conference on computer vision, pages 558--567, 2021."},{"key":"e_1_3_2_1_96_1","doi-asserted-by":"publisher","DOI":"10.1145\/2600212.2600213"},{"key":"e_1_3_2_1_97_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00053"},{"key":"e_1_3_2_1_98_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA.2018.00053"},{"key":"e_1_3_2_1_99_1","volume-title":"Xi Victoria Lin, et al. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, et al. Opt: Open pre-trained transformer language models. arXiv preprint arXiv:2205.01068, 2022."},{"key":"e_1_3_2_1_100_1","first-page":"1","volume-title":"Proceedings of the 41st IEEE\/ACM International Conference on Computer-Aided Design","author":"Zhou Ranyang","year":"2022","unstructured":"Ranyang Zhou, Arman Roohi, Durga Misra, and Shaahin Angizi. Red-lut: Reconfigurable in-dram luts enabling massive parallel computation. In Proceedings of the 41st IEEE\/ACM International Conference on Computer-Aided Design, pages 1--8, 2022."},{"key":"e_1_3_2_1_101_1","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569670"},{"key":"e_1_3_2_1_102_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA56546.2023.10071005"},{"key":"e_1_3_2_1_103_1","first-page":"489","volume-title":"2022 USENIX Annual Technical Conference (USENIX ATC 22)","author":"Zhou Zhe","year":"2022","unstructured":"Zhe Zhou, Xuechao Wei, Jiejing Zhang, and Guangyu Sun. PetS: A unified framework for Parameter-Efficient transformers serving. In 2022 USENIX Annual Technical Conference (USENIX ATC 22), pages 489--504, Carlsbad, CA, July 2022. 
USENIX Association."},{"key":"e_1_3_2_1_104_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358256"}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640376","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640376","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:41Z","timestamp":1750291421000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640376"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":104,"alternative-id":["10.1145\/3620665.3640376","10.1145\/3620665"],"URL":"https:\/\/doi.org\/10.1145\/3620665.3640376","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}