{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,29]],"date-time":"2025-08-29T10:43:16Z","timestamp":1756464196962,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":52,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,10,13]],"date-time":"2024-10-13T00:00:00Z","timestamp":1728777600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"name":"Institute of Information & Communications Technology Planning & Evaluation","award":["2021-0-01817"],"award-info":[{"award-number":["2021-0-01817"]}]},{"DOI":"10.13039\/501100006374","name":"National Research Foundation of Korea","doi-asserted-by":"publisher","award":["NRF-2021R1A2C1011482"],"award-info":[{"award-number":["NRF-2021R1A2C1011482"]}],"id":[{"id":"10.13039\/501100006374","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Samsung Advanced Institute of Technology, Samsung Electronics Company Ltd."}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,10,14]]},"DOI":"10.1145\/3656019.3676945","type":"proceedings-article","created":{"date-parts":[[2024,10,11]],"date-time":"2024-10-11T10:34:08Z","timestamp":1728642848000},"page":"78-90","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":1,"title":["Activation Sequence Caching: High-Throughput and Memory-Efficient Generative Inference with a Single GPU"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0009-0002-9586-1593","authenticated-orcid":false,"given":"Sowoong","family":"Kim","sequence":"first","affiliation":[{"name":"UNIST, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-3886-7523","authenticated-orcid":false,"given":"Eunyeong","family":"Sim","sequence":"additional","affiliation":[{"name":"UNIST, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-6978-5780","authenticated-orcid":false,"given":"Youngsam","family":"Shin","sequence":"additional","affiliation":[{"name":"Samsung Advanced Institute of Technology, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-1242-6343","authenticated-orcid":false,"given":"YeonGon","family":"Cho","sequence":"additional","affiliation":[{"name":"Samsung Advanced Institute of Technology, Republic of Korea"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1877-7307","authenticated-orcid":false,"given":"Woongki","family":"Baek","sequence":"additional","affiliation":[{"name":"UNIST, Republic of Korea"}]}],"member":"320","published-online":{"date-parts":[[2024,10,13]]},"reference":[{"key":"e_1_3_2_1_1_1","unstructured":"2017. NVIDIA V100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/v100\/."},{"key":"e_1_3_2_1_2_1","unstructured":"2020. Hugging Face Accelerate. https:\/\/huggingface.co\/docs\/accelerate\/index."},{"key":"e_1_3_2_1_3_1","unstructured":"2020. NVIDIA A100 Tensor Core GPU. https:\/\/www.nvidia.com\/en-us\/data-center\/a100\/."},{"key":"e_1_3_2_1_4_1","unstructured":"2022. 990 Pro PCIe\u00ae 4.0 NVMe\u00ae SSD 2TB. https:\/\/www.samsung.com\/us\/computing\/memory-storage\/solid-state-drives\/990-pro-pcie-4-0-nvme-ssd-2tb-mz-v9p2t0b-am\/."},{"key":"e_1_3_2_1_5_1","unstructured":"2022. CUDA Toolkit. https:\/\/developer.nvidia.com\/cuda-toolkit."},{"key":"e_1_3_2_1_6_1","unstructured":"2022. metaseq\/\/projects\/\/OPT. https:\/\/github.com\/facebookresearch\/metaseq\/tree\/main\/projects\/OPT."},{"key":"e_1_3_2_1_7_1","unstructured":"2023. GitHub - FMInference\/FlexGen: Running large language models on a single GPU for throughput-oriented scenarios.https:\/\/github.com\/FMInference\/FlexGen."},{"key":"e_1_3_2_1_8_1","volume-title":"SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arxiv:2308.16369\u00a0[cs.LG]","author":"Agrawal Amey","year":"2023","unstructured":"Amey Agrawal, Ashish Panwar, Jayashree Mohan, Nipun Kwatra, Bhargav\u00a0S. Gulavani, and Ramachandran Ramjee. 2023. SARATHI: Efficient LLM Inference by Piggybacking Decodes with Chunked Prefills. arxiv:2308.16369\u00a0[cs.LG]"},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00051"},{"key":"e_1_3_2_1_10_1","volume-title":"Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. Curran Associates","author":"Brown Tom","year":"1877","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. Curran Associates, Inc., 1877\u20131901. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2020\/file\/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf"},{"key":"e_1_3_2_1_11_1","volume-title":"Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Yan, Meghan Cowan, Haichen Shen, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. 2018. TVM: an automated end-to-end optimizing compiler for deep learning. In Proceedings of the 13th USENIX Conference on Operating Systems Design and Implementation (Carlsbad, CA, USA) (OSDI\u201918). USENIX Association, USA, 579\u2013594."},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.23919\/DATE.2018.8341972"},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 11th USENIX Conference on Operating Systems Design and Implementation","author":"Chilimbi Trishul","year":"2014","unstructured":"Trishul Chilimbi, Yutaka Suzue, Johnson Apacible, and Karthik Kalyanaraman. 2014. Project Adam: Building an Efficient and Scalable Deep Learning Training System. In Proceedings of the 11th USENIX Conference on Operating Systems Design and Implementation (Broomfield, CO) (OSDI\u201914). USENIX Association, Berkeley, CA, USA, 571\u2013582. http:\/\/dl.acm.org\/citation.cfm?id=2685048.2685094"},{"key":"e_1_3_2_1_14_1","volume-title":"Proceedings of the 25th International Conference on Neural Information Processing Systems -","volume":"1","author":"Dean Jeffrey","year":"2012","unstructured":"Jeffrey Dean, Greg\u00a0S. Corrado, Rajat Monga, Kai Chen, Matthieu Devin, Quoc\u00a0V. Le, Mark\u00a0Z. Mao, Marc\u2019Aurelio Ranzato, Andrew Senior, Paul Tucker, Ke Yang, and Andrew\u00a0Y. Ng. 2012. Large scale distributed deep networks. In Proceedings of the 25th International Conference on Neural Information Processing Systems - Volume 1 (Lake Tahoe, Nevada) (NIPS\u201912). Curran Associates Inc., Red Hook, NY, USA, 1223\u20131231."},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1126\/science.adh4451"},{"key":"e_1_3_2_1_16_1","unstructured":"Roberto Gozalo-Brizuela and Eduardo\u00a0C. Garrido-Merch\u00e1n. 2023. A survey of Generative AI Applications. arxiv:2306.02781\u00a0[cs.LG]"},{"key":"e_1_3_2_1_17_1","doi-asserted-by":"publisher","unstructured":"Myeonggyun Han and Woongki Baek. 2021. HERTI: A Reinforcement Learning-Augmented System for Efficient Real-Time Inference on Heterogeneous Embedded Systems. In 2021 30th International Conference on Parallel Architectures and Compilation Techniques (PACT). 90\u2013102. https:\/\/doi.org\/10.1109\/PACT52795.2021.00014","DOI":"10.1109\/PACT52795.2021.00014"},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1109\/TC.2019.2949408"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1109\/PACT.2019.00021"},{"key":"e_1_3_2_1_20_1","unstructured":"Mark Harris. 2012. How to Optimize Data Transfers in CUDA C\/C++. https:\/\/developer.nvidia.com\/blog\/how-optimize-data-transfers-cuda-cc\/."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378465"},{"key":"e_1_3_2_1_22_1","doi-asserted-by":"publisher","DOI":"10.1109\/SC41404.2022.00016"},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378530"},{"volume-title":"GPipe: efficient training of giant neural networks using pipeline parallelism","author":"Huang Yanping","key":"e_1_3_2_1_24_1","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Mia\u00a0Xu Chen, Dehao Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V. Le, Yonghui Wu, and Zhifeng Chen. 2019. GPipe: efficient training of giant neural networks using pipeline parallelism. Curran Associates Inc., Red Hook, NY, USA."},{"key":"e_1_3_2_1_25_1","volume-title":"Proceedings of Machine Learning and Systems, A.\u00a0Smola, A.\u00a0Dimakis, and I.\u00a0Stoica (Eds.). Vol.\u00a03. 711\u2013732","author":"Ivanov Andrei","year":"2021","unstructured":"Andrei Ivanov, Nikoli Dryden, Tal Ben-Nun, Shigang Li, and Torsten Hoefler. 2021. Data Movement Is All You Need: A Case Study on Optimizing Transformers. In Proceedings of Machine Learning and Systems, A.\u00a0Smola, A.\u00a0Dimakis, and I.\u00a0Stoica (Eds.). Vol.\u00a03. 711\u2013732. https:\/\/proceedings.mlsys.org\/paper_files\/paper\/2021\/file\/bc86e95606a6392f51f95a8de106728d-Paper.pdf"},{"key":"e_1_3_2_1_26_1","unstructured":"Sam\u00a0Ade Jacobs Masahiro Tanaka Chengming Zhang Minjia Zhang Shuaiwen\u00a0Leon Song Samyam Rajbhandari and Yuxiong He. 2023. DeepSpeed Ulysses: System Optimizations for Enabling Training of Extreme Long Sequence Transformer Models. arxiv:2309.14509\u00a0[cs.LG]"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.14778\/3551793.3551828"},{"key":"e_1_3_2_1_28_1","volume-title":"AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23)","author":"Li Zhuohan","year":"2023","unstructured":"Zhuohan Li, Lianmin Zheng, Yinmin Zhong, Vincent Liu, Ying Sheng, Xin Jin, Yanping Huang, Zhifeng Chen, Hao Zhang, Joseph\u00a0E. Gonzalez, and Ion Stoica. 2023. AlpaServe: Statistical Multiplexing with Model Parallelism for Deep Learning Serving. In 17th USENIX Symposium on Operating Systems Design and Implementation (OSDI 23). USENIX Association, Boston, MA, 663\u2013679. https:\/\/www.usenix.org\/conference\/osdi23\/presentation\/li-zhouhan"},{"key":"e_1_3_2_1_29_1","unstructured":"Justin Luitjens. 2014. CUDA Streams: Best Practices and Common Pitfalls. https:\/\/on-demand.gputechconf.com\/gtc\/2014\/presentations\/S4158-cuda-streams-best-practices-common-pitfalls.pdf."},{"key":"e_1_3_2_1_30_1","volume-title":"Proceedings of the 34th International Conference on Machine Learning, ICML 2017","author":"Mirhoseini Azalia","year":"2017","unstructured":"Azalia Mirhoseini, Hieu Pham, Quoc\u00a0V. Le, Benoit Steiner, Rasmus Larsen, Yuefeng Zhou, Naveen Kumar, Mohammad Norouzi, Samy Bengio, and Jeff Dean. 2017. Device Placement Optimization with Reinforcement Learning. In Proceedings of the 34th International Conference on Machine Learning, ICML 2017, Sydney, NSW, Australia, 6-11 August 2017(ICML\u201917). 2430\u20132439. http:\/\/proceedings.mlr.press\/v70\/mirhoseini17a.html"},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359646"},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"volume-title":"PyTorch: An Imperative Style","author":"Paszke Adam","key":"e_1_3_2_1_33_1","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Advances in Neural Information Processing Systems 32, H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.). Curran Associates, Inc., 8024\u20138035. http:\/\/papers.neurips.cc\/paper\/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3373376.3378505"},{"key":"e_1_3_2_1_35_1","volume-title":"Proceedings of Machine Learning and Systems 5","author":"Pope Reiner","year":"2023","unstructured":"Reiner Pope, Sholto Douglas, Aakanksha Chowdhery, Jacob Devlin, James Bradbury, Jonathan Heek, Kefan Xiao, Shivani Agrawal, and Jeff Dean. 2023. Efficiently scaling transformer inference. Proceedings of Machine Learning and Systems 5 (2023)."},{"key":"e_1_3_2_1_36_1","unstructured":"Bharadwaj Pudipeddi Maral Mesmakhosroshahi Jinwen Xi and Sujeeth Bharadwaj. 2020. Training Large Neural Networks with Constant Memory using a New Execution Algorithm. arxiv:2002.05645\u00a0[cs.LG]"},{"key":"e_1_3_2_1_37_1","volume-title":"Proceedings of the 39th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0162)","author":"Rajbhandari Samyam","year":"2022","unstructured":"Samyam Rajbhandari, Conglong Li, Zhewei Yao, Minjia Zhang, Reza\u00a0Yazdani Aminabadi, Ammar\u00a0Ahmad Awan, Jeff Rasley, and Yuxiong He. 2022. DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale. In Proceedings of the 39th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0162), Kamalika Chaudhuri, Stefanie Jegelka, Le\u00a0Song, Csaba Szepesvari, Gang Niu, and Sivan Sabato (Eds.). PMLR, 18332\u201318346. https:\/\/proceedings.mlr.press\/v162\/rajbhandari22a.html"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis","author":"Rajbhandari Samyam","year":"2020","unstructured":"Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, and Yuxiong He. 2020. ZeRO: memory optimizations toward training trillion parameter models. In Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis (Atlanta, Georgia) (SC \u201920). IEEE Press, Article 20, 16\u00a0pages."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_40_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00057"},{"key":"e_1_3_2_1_41_1","volume-title":"ZeRO-Offload: Democratizing Billion-Scale Model Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza\u00a0Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. 2021. ZeRO-Offload: Democratizing Billion-Scale Model Training. In 2021 USENIX Annual Technical Conference (USENIX ATC 21). USENIX Association, 551\u2013564. https:\/\/www.usenix.org\/conference\/atc21\/presentation\/ren-jie"},{"key":"e_1_3_2_1_42_1","volume-title":"Memory-efficient Neural Network Design. In The 49th Annual IEEE\/ACM International Symposium on Microarchitecture","author":"Rhu Minsoo","year":"2016","unstructured":"Minsoo Rhu, Natalia Gimelshein, Jason Clemons, Arslan Zulfiqar, and Stephen\u00a0W. Keckler. 2016. vDNN: Virtualized Deep Neural Networks for Scalable, Memory-efficient Neural Network Design. In The 49th Annual IEEE\/ACM International Symposium on Microarchitecture (Taipei, Taiwan) (MICRO-49). IEEE Press, Piscataway, NJ, USA, Article 18, 13\u00a0pages. http:\/\/dl.acm.org\/citation.cfm?id=3195638.3195660"},{"key":"e_1_3_2_1_43_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0202)","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Beidi Chen, Percy Liang, Christopher Re, Ion Stoica, and Ce Zhang. 2023. FlexGen: High-Throughput Generative Inference of Large Language Models with a Single GPU. In Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, 31094\u201331116. https:\/\/proceedings.mlr.press\/v202\/sheng23a.html"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.5555\/3295222.3295349"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3178487.3178491"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.1145\/3559009.3569674"},{"key":"e_1_3_2_1_47_1","unstructured":"Qian Yang Zhouyuan Huo Wenlin Wang and Lawrence Carin. 2019. Ouroboros: On Accelerating Training of Transformer-Based Language Models. In Advances in Neural Information Processing Systems H.\u00a0Wallach H.\u00a0Larochelle A.\u00a0Beygelzimer F.\u00a0d'Alch\u00e9-Buc E.\u00a0Fox and R.\u00a0Garnett (Eds.). Vol.\u00a032. Curran Associates Inc.https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/1b79b52d1bf6f71b2b1eb7ca08ed0776-Paper.pdf"},{"key":"e_1_3_2_1_48_1","volume-title":"Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Yu Gyeong-In","year":"2022","unstructured":"Gyeong-In Yu, Joo\u00a0Seong Jeong, Geon-Woo Kim, Soojeong Kim, and Byung-Gon Chun. 2022. Orca: A Distributed Serving System for Transformer-Based Generative Models. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22). USENIX Association, Carlsbad, CA, 521\u2013538. https:\/\/www.usenix.org\/conference\/osdi22\/presentation\/yu"},{"key":"e_1_3_2_1_49_1","volume-title":"Deep generative molecular design reshapes drug discovery. Cell Reports Medicine 3, 12","author":"Zeng Xiangxiang","year":"2022","unstructured":"Xiangxiang Zeng, Fei Wang, Yuan Luo, Seung-gu Kang, Jian Tang, Felice\u00a0C Lightstone, Evandro\u00a0F Fang, Wendy Cornell, Ruth Nussinov, and Feixiong Cheng. 2022. Deep generative molecular design reshapes drug discovery. Cell Reports Medicine 3, 12 (2022)."},{"key":"e_1_3_2_1_50_1","volume-title":"Poseidon: An Efficient Communication Architecture for Distributed Deep Learning on GPU Clusters. In 2017 USENIX Annual Technical Conference (USENIX ATC 17)","author":"Zhang Hao","year":"2017","unstructured":"Hao Zhang, Zeyu Zheng, Shizhen Xu, Wei Dai, Qirong Ho, Xiaodan Liang, Zhiting Hu, Jinliang Wei, Pengtao Xie, and Eric\u00a0P. Xing. 2017. Poseidon: An Efficient Communication Architecture for Distributed Deep Learning on GPU Clusters. In 2017 USENIX Annual Technical Conference (USENIX ATC 17). USENIX Association, Santa Clara, CA, 181\u2013193. https:\/\/www.usenix.org\/conference\/atc17\/technical-sessions\/presentation\/zhang"},{"key":"e_1_3_2_1_51_1","volume-title":"OPT: Open Pre-trained Transformer Language Models. arxiv:2205.01068\u00a0[cs.CL]","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi\u00a0Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, Daniel Simig, Punit\u00a0Singh Koura, Anjali Sridhar, Tianlu Wang, and Luke Zettlemoyer. 2022. OPT: Open Pre-trained Transformer Language Models. arxiv:2205.01068\u00a0[cs.CL]"},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3410463.3414642"}],"event":{"name":"PACT '24: International Conference on Parallel Architectures and Compilation Techniques","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"],"location":"Long Beach CA USA","acronym":"PACT '24"},"container-title":["Proceedings of the 2024 International Conference on Parallel Architectures and Compilation Techniques"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3656019.3676945","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3656019.3676945","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T19:54:59Z","timestamp":1755892499000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3656019.3676945"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,10,13]]},"references-count":52,"alternative-id":["10.1145\/3656019.3676945","10.1145\/3656019"],"URL":"https:\/\/doi.org\/10.1145\/3656019.3676945","relation":{},"subject":[],"published":{"date-parts":[[2024,10,13]]},"assertion":[{"value":"2024-10-13","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}