{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,12]],"date-time":"2026-05-12T15:42:18Z","timestamp":1778600538354,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":34,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"SRC\/DARPA JUMP 2.0 CoCoSys: Center for the Co-design of Cognitive Systems","award":["AWD-004311-S5"],"award-info":[{"award-number":["AWD-004311-S5"]}]},{"name":"SRC AIHW","award":["2023-AI-3158"],"award-info":[{"award-number":["2023-AI-3158"]}]},{"name":"Precourt Institute for Energy","award":["MitraPPP-EEC"],"award-info":[{"award-number":["MitraPPP-EEC"]}]},{"name":"Samsung (HLS award)","award":["SPO #296426"],"award-info":[{"award-number":["SPO #296426"]}]},{"name":"Apple Stanford EE PhD Fellowship in Integrated Systems"},{"name":"NSF FuSe-TG","award":["2235462"],"award-info":[{"award-number":["2235462"]}]},{"name":"AI Chip Center for Emerging Smart Systems (ACCESS), Hong Kong SAR","award":["183962"],"award-info":[{"award-number":["183962"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620666.3651368","type":"proceedings-article","created":{"date-parts":[[2024,4,24]],"date-time":"2024-04-24T12:08:21Z","timestamp":1713960501000},"page":"5-21","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":22,"title":["8-bit Transformer Inference and Fine-tuning for Edge Accelerators"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9643-7490","authenticated-orcid":false,"given":"Jeffrey","family":"Yu","sequence":"first","affiliation":[{"name":"Electrical Engineering, Stanford University, Stanford, CA, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4179-1692","authenticated-orcid":false,"given":"Kartik","family":"Prabhu","sequence":"additional","affiliation":[{"name":"Electrical Engineering, Stanford University, Stanford, CA, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5763-8174","authenticated-orcid":false,"given":"Yonatan","family":"Urman","sequence":"additional","affiliation":[{"name":"Electrical Engineering, Stanford University, Stanford, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-3393-5489","authenticated-orcid":false,"given":"Robert M.","family":"Radway","sequence":"additional","affiliation":[{"name":"Electrical Engineering, Stanford University, Stanford, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0009-0528-8318","authenticated-orcid":false,"given":"Eric","family":"Han","sequence":"additional","affiliation":[{"name":"Electrical Engineering, Stanford University, Stanford, CA, USA"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-8834-8663","authenticated-orcid":false,"given":"Priyanka","family":"Raina","sequence":"additional","affiliation":[{"name":"Electrical Engineering, Stanford University, Stanford, CA, United States of America"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"Scalable methods for 8-bit training of neural networks","author":"Banner Ron","year":"2018","unstructured":"Ron Banner, Itay Hubara, Elad Hoffer, and Daniel Soudry. Scalable methods for 8-bit training of neural networks, 2018. arXiv:1805. 11046."},{"key":"e_1_3_2_1_2_1","volume-title":"Language models are few-shot learners","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. Language models are few-shot learners, 2020. arXiv:2005.14165."},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1038\/s41598-021-94691-7"},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58452-8_13"},{"key":"e_1_3_2_1_5_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00950"},{"key":"e_1_3_2_1_6_1","doi-asserted-by":"publisher","DOI":"10.3390\/s20051515"},{"key":"e_1_3_2_1_7_1","volume-title":"8-bit matrix multiplication for transformers at scale","author":"Dettmers Tim","year":"2022","unstructured":"Tim Dettmers, Mike Lewis, Younes Belkada, and Luke Zettlemoyer. LLM.int8(): 8-bit matrix multiplication for transformers at scale, 2022. arXiv:2208.07339."},{"key":"e_1_3_2_1_8_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_2_1_9_1","volume-title":"An image is worth 16\u00d716 words: Transformers for image recognition at scale","author":"Dosovitskiy Alexey","year":"2021","unstructured":"Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby. An image is worth 16\u00d716 words: Transformers for image recognition at scale, 2021. arXiv:2010.11929."},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.14529\/jsfi170206"},{"key":"e_1_3_2_1_11_1","volume-title":"Distilling the knowledge in a neural network","author":"Hinton Geoffrey","year":"2015","unstructured":"Geoffrey Hinton, Oriol Vinyals, and Jeff Dean. Distilling the knowledge in a neural network, 2015. arXiv:1503.02531."},{"key":"e_1_3_2_1_12_1","series-title":"Proceedings of Machine Learning Research","first-page":"2790","volume-title":"Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. Parameter-efficient transfer learning for NLP","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. Parameter-efficient transfer learning for NLP. In Kamalika Chaudhuri and Ruslan Salakhutdinov, editors, Proceedings of the 36th International Conference on Machine Learning, volume 97 of Proceedings of Machine Learning Research, pages 2790--2799. PMLR, June 2019. URL: https:\/\/proceedings.mlr.press\/v97\/houlsby19a.html."},{"key":"e_1_3_2_1_13_1","volume-title":"Lora: Low-rank adaptation of large language models","author":"Hu Edward J.","year":"2021","unstructured":"Edward J. Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: Low-rank adaptation of large language models, 2021. arXiv:2106.09685."},{"key":"e_1_3_2_1_14_1","volume-title":"Rethinking floating point for deep learning","author":"Johnson Jeff","year":"2018","unstructured":"Jeff Johnson. Rethinking floating point for deep learning, 2018. arXiv: 1811.01721."},{"key":"e_1_3_2_1_15_1","volume-title":"RoBERTa: A robustly optimized bert pretraining approach","author":"Liu Yinhan","year":"2019","unstructured":"Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. RoBERTa: A robustly optimized bert pretraining approach, 2019. arXiv:1907.11692."},{"key":"e_1_3_2_1_16_1","volume-title":"Swin Transformer: Hierarchical vision transformer using shifted windows","author":"Liu Ze","year":"2021","unstructured":"Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, and Baining Guo. Swin Transformer: Hierarchical vision transformer using shifted windows, 2021. arXiv:2103.14030."},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2020.2985971"},{"key":"e_1_3_2_1_18_1","volume-title":"Mixed precision training","author":"Micikevicius Paulius","year":"2018","unstructured":"Paulius Micikevicius, Sharan Narang, Jonah Alben, Gregory Diamos, Erich Elsen, David Garcia, Boris Ginsburg, Michael Houston, Oleksii Kuchaiev, Ganesh Venkatesh, and Hao Wu. Mixed precision training, 2018. arXiv:1710.03740."},{"key":"e_1_3_2_1_19_1","volume-title":"FP8 formats for deep learning","author":"Micikevicius Paulius","year":"2022","unstructured":"Paulius Micikevicius, Dusan Stosic, Neil Burgess, Marius Cornea, Pradeep Dubey, Richard Grisenthwaite, Sangwon Ha, Alexander Heinecke, Patrick Judd, John Kamalu, Naveen Mellempudi, Stuart Oberman, Mohammad Shoeybi, Michael Siu, and Hao Wu. FP8 formats for deep learning, 2022. arXiv:2209.05433."},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.1109\/TETC.2021.3109127"},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2015.7178964"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/JSSC.2022.3140753"},{"key":"e_1_3_2_1_23_1","volume-title":"Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. Robust speech recognition via large-scale weak supervision","author":"Radford Alec","year":"2022","unstructured":"Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, and Ilya Sutskever. Robust speech recognition via large-scale weak supervision, 2022. arXiv:2212.04356."},{"issue":"140","key":"e_1_3_2_1_24_1","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu. Exploring the limits of transfer learning with a unified text-to-text transformer. Journal of Machine Learning Research, 21(140):1--67, 2020. URL: http:\/\/jmlr.org\/papers\/v21\/20-074.html.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D16-1264"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP39728.2021.9413919"},{"key":"e_1_3_2_1_27_1","volume-title":"Segmenter: Transformer for semantic segmentation","author":"Strudel Robin","year":"2021","unstructured":"Robin Strudel, Ricardo Garcia, Ivan Laptev, and Cordelia Schmid. Segmenter: Transformer for semantic segmentation, 2021. arXiv: 2105.05633."},{"key":"e_1_3_2_1_28_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.195"},{"key":"e_1_3_2_1_29_1","doi-asserted-by":"publisher","DOI":"10.48550\/arXiv.1706.03762"},{"key":"e_1_3_2_1_30_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-5446"},{"key":"e_1_3_2_1_31_1","volume-title":"Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. HuggingFace's Transformers: State-of-the-art natural language processing","author":"Wolf Thomas","year":"2020","unstructured":"Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Clement Delangue, Anthony Moi, Pierric Cistac, Tim Rault, R\u00e9mi Louf, Morgan Funtowicz, Joe Davison, Sam Shleifer, Patrick von Platen, Clara Ma, Yacine Jernite, Julien Plu, Canwen Xu, Teven Le Scao, Sylvain Gugger, Mariama Drame, Quentin Lhoest, and Alexander M. Rush. HuggingFace's Transformers: State-of-the-art natural language processing, 2020. arXiv:1910.03771."},{"key":"e_1_3_2_1_32_1","volume-title":"Training transformers with 4-bit integers","author":"Xi Haocheng","year":"2023","unstructured":"Haocheng Xi, Changhao Li, Jianfei Chen, and Jun Zhu. Training transformers with 4-bit integers, 2023. arXiv:2306.11987."},{"key":"e_1_3_2_1_33_1","volume-title":"SmoothQuant: Accurate and efficient post-training quantization for large language models","author":"Xiao Guangxuan","year":"2023","unstructured":"Guangxuan Xiao, Ji Lin, Mickael Seznec, Hao Wu, Julien Demouth, and Song Han. SmoothQuant: Accurate and efficient post-training quantization for large language models, 2023. arXiv:2211.10438."},{"key":"e_1_3_2_1_34_1","volume-title":"Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models","author":"Zaken Elad Ben","year":"2022","unstructured":"Elad Ben Zaken, Shauli Ravfogel, and Yoav Goldberg. Bitfit: Simple parameter-efficient fine-tuning for transformer-based masked language-models, 2022. arXiv:2106.10199."}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 3"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620666.3651368","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:43Z","timestamp":1750291423000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620666.3651368"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":34,"alternative-id":["10.1145\/3620666.3651368","10.1145\/3620666"],"URL":"https:\/\/doi.org\/10.1145\/3620666.3651368","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}