{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,25]],"date-time":"2026-04-25T08:35:42Z","timestamp":1777106142308,"version":"3.51.4"},"publisher-location":"New York, NY, USA","reference-count":94,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,4,27]],"date-time":"2024-04-27T00:00:00Z","timestamp":1714176000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62222210"],"award-info":[{"award-number":["62222210"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U21B2017"],"award-info":[{"award-number":["U21B2017"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62072297"],"award-info":[{"award-number":["62072297"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Key R&D Program of China","award":["2021ZD0110104"],"award-info":[{"award-number":["2021ZD0110104"]}]}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,4,27]]},"DOI":"10.1145\/3620665.3640423","type":"proceedings-article","created":{"date-parts":[[2024,4,22]],"date-time":"2024-04-22T14:18:06Z","timestamp":1713795486000},"page":"450-466","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":23,"title":["GMLake: Efficient and Transparent GPU Memory Defragmentation for Large-scale DNN Training with Virtual Memory Stitching"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-4479-5525","authenticated-orcid":false,"given":"Cong","family":"Guo","sequence":"first","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"},{"name":"Shanghai Qi Zhi Institute, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0002-3043-7775","authenticated-orcid":false,"given":"Rui","family":"Zhang","sequence":"additional","affiliation":[{"name":"Ant Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0007-4694-6597","authenticated-orcid":false,"given":"Jiale","family":"Xu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"},{"name":"Shanghai Qi Zhi Institute, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5660-5493","authenticated-orcid":false,"given":"Jingwen","family":"Leng","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"},{"name":"Shanghai Qi Zhi Institute, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0874-0682","authenticated-orcid":false,"given":"Zihan","family":"Liu","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"},{"name":"Shanghai Qi Zhi Institute, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0000-5516-2734","authenticated-orcid":false,"given":"Ziyu","family":"Huang","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"},{"name":"Shanghai Qi Zhi Institute, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0034-2302","authenticated-orcid":false,"given":"Minyi","family":"Guo","sequence":"additional","affiliation":[{"name":"Shanghai Jiao Tong University, Shanghai, China"},{"name":"Shanghai Qi Zhi Institute, Shanghai, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-2630-8361","authenticated-orcid":false,"given":"Hao","family":"Wu","sequence":"additional","affiliation":[{"name":"Ant Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-0574-245X","authenticated-orcid":false,"given":"Shouren","family":"Zhao","sequence":"additional","affiliation":[{"name":"Ant Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0003-5268-2541","authenticated-orcid":false,"given":"Junping","family":"Zhao","sequence":"additional","affiliation":[{"name":"Ant Group, Beijing, China"}]},{"ORCID":"https:\/\/orcid.org\/0009-0008-6685-1293","authenticated-orcid":false,"given":"Ke","family":"Zhang","sequence":"additional","affiliation":[{"name":"Ant Group, Beijing, China"}]}],"member":"320","published-online":{"date-parts":[[2024,4,27]]},"reference":[{"key":"e_1_3_2_1_1_1","volume-title":"et al. Tensorflow: Large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:1603.04467","author":"Abadi Mart\u00edn","year":"2016","unstructured":"Mart\u00edn Abadi, Ashish Agarwal, Paul Barham, Eugene Brevdo, Zhifeng Chen, Craig Citro, Greg S Corrado, Andy Davis, Jeffrey Dean, Matthieu Devin, et al. Tensorflow: Large-scale machine learning on heterogeneous distributed systems. arXiv preprint arXiv:1603.04467, 2016."},{"key":"e_1_3_2_1_2_1","doi-asserted-by":"publisher","DOI":"10.1109\/AICAS54282.2022.9869991"},{"key":"e_1_3_2_1_3_1","first-page":"451","volume-title":"Proceedings of the 2015 ACM SIGPLAN International Conference on Object-Oriented Programming, Systems, Languages, and Applications, OOPSLA 2015, part of SPLASH 2015","author":"Aigner Martin","year":"2015","unstructured":"Martin Aigner, Christoph M. Kirsch, Michael Lippautz, and Ana Sokolova. Fast, multicore-scalable, low-fragmentation memory allocation through large virtual memory and global data structures. In Jonathan Aldrich and Patrick Eugster, editors, Proceedings of the 2015 ACM SIGPLAN International Conference on Object-Oriented Programming, Systems, Languages, and Applications, OOPSLA 2015, part of SPLASH 2015, Pittsburgh, PA, USA, October 25-30, 2015, pages 451--469. ACM, 2015."},{"key":"e_1_3_2_1_4_1","first-page":"23844","article-title":"Efficient combination of rematerialization and offloading for training dnns","volume":"34","author":"Beaumont Olivier","year":"2021","unstructured":"Olivier Beaumont, Lionel Eyraud-Dubois, and Alena Shilova. Efficient combination of rematerialization and offloading for training dnns. Advances in Neural Information Processing Systems, 34:23844--23857, 2021.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_5_1","volume-title":"Unlimiformer: Long-range transformers with unlimited length input. CoRR, abs\/2305.01625","author":"Bertsch Amanda","year":"2023","unstructured":"Amanda Bertsch, Uri Alon, Graham Neubig, and Matthew R. Gormley. Unlimiformer: Long-range transformers with unlimited length input. CoRR, abs\/2305.01625, 2023."},{"key":"e_1_3_2_1_6_1","volume-title":"Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, and Samuel Weinbach. Gpt-neox-20b: An open-source autoregressive language model","author":"Black Sid","year":"2022","unstructured":"Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, and Samuel Weinbach. Gpt-neox-20b: An open-source autoregressive language model, 2022."},{"key":"e_1_3_2_1_7_1","volume-title":"Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, et al. Language models are few-shot learners. Advances in neural information processing systems, 33:1877--1901, 2020."},{"key":"e_1_3_2_1_8_1","volume-title":"Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020","author":"Brown Tom B.","year":"2020","unstructured":"Tom B. Brown, Benjamin Mann, Nick Ryder, et al. Language models are few-shot learners. In Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual, 2020."},{"key":"e_1_3_2_1_9_1","first-page":"578","volume-title":"13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018","author":"Chen Tianqi","year":"2018","unstructured":"Tianqi Chen, Thierry Moreau, Ziheng Jiang, Lianmin Zheng, Eddie Q. Yan, Haichen Shen, Meghan Cowan, Leyuan Wang, Yuwei Hu, Luis Ceze, Carlos Guestrin, and Arvind Krishnamurthy. TVM: an automated end-to-end optimizing compiler for deep learning. In 13th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2018, Carlsbad, CA, USA, October 8-10, 2018, pages 578--594. USENIX Association, 2018."},{"key":"e_1_3_2_1_10_1","volume-title":"March","author":"Chiang Wei-Lin","year":"2023","unstructured":"Wei-Lin Chiang, Zhuohan Li, Zi Lin, Ying Sheng, Zhanghao Wu, Hao Zhang, Lianmin Zheng, Siyuan Zhuang, Yonghao Zhuang, Joseph E. Gonzalez, Ion Stoica, and Eric P. Xing. Vicuna: An open-source chatbot impressing gpt-4 with 90%* chatgpt quality, March 2023."},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA51647.2021.00049"},{"key":"e_1_3_2_1_12_1","doi-asserted-by":"publisher","DOI":"10.1109\/HCS49909.2020.9220622"},{"key":"e_1_3_2_1_13_1","volume-title":"Flashattention-2: Faster attention with better parallelism and work partitioning. CoRR, abs\/2307.08691","author":"Dao Tri","year":"2023","unstructured":"Tri Dao. Flashattention-2: Faster attention with better parallelism and work partitioning. CoRR, abs\/2307.08691, 2023."},{"key":"e_1_3_2_1_14_1","volume-title":"NeurIPS","author":"Dao Tri","year":"2022","unstructured":"Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, and Christopher R\u00e9. Flashattention: Fast and memory-efficient exact attention with io-awareness. In NeurIPS, 2022."},{"key":"e_1_3_2_1_15_1","volume-title":"Zero documentation","year":"2023","unstructured":"DeepSpeed. Zero documentation, 2023."},{"key":"e_1_3_2_1_16_1","volume-title":"Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019","volume":"1","author":"Devlin Jacob","year":"2019","unstructured":"Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. BERT: pre-training of deep bidirectional transformers for language understanding. In Jill Burstein, Christy Doran, and Thamar Solorio, editors, Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, NAACL-HLT 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long and Short Papers), pages 4171--4186. Association for Computational Linguistics, 2019."},{"key":"e_1_3_2_1_17_1","volume-title":"Glm: General language model pretraining with autoregressive blank infilling. arXiv preprint arXiv:2103.10360","author":"Du Zhengxiao","year":"2021","unstructured":"Zhengxiao Du, Yujie Qian, Xiao Liu, Ming Ding, Jiezhong Qiu, Zhilin Yang, and Jie Tang. Glm: General language model pretraining with autoregressive blank infilling. arXiv preprint arXiv:2103.10360, 2021."},{"key":"e_1_3_2_1_18_1","first-page":"369","volume-title":"Ottawa Linux Symposium","volume":"1","author":"Gorman Mel","year":"2006","unstructured":"Mel Gorman and Andy Whitcroft. The what, the why and the where to of anti-fragmentation. In Ottawa Linux Symposium, volume 1, pages 369--384. Citeseer, 2006."},{"key":"e_1_3_2_1_19_1","volume-title":"How far does bert look at: Distance-based clustering and analysis of bert's attention. arXiv preprint arXiv:2011.00943","author":"Guan Yue","year":"2020","unstructured":"Yue Guan, Jingwen Leng, Chao Li, Quan Chen, and Minyi Guo. How far does bert look at: Distance-based clustering and analysis of bert's attention. arXiv preprint arXiv:2011.00943, 2020."},{"key":"e_1_3_2_1_20_1","first-page":"7275","volume-title":"Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), ACL 2022","author":"Guan Yue","year":"2022","unstructured":"Yue Guan, Zhengyi Li, Jingwen Leng, Zhouhan Lin, and Minyi Guo. Transkimmer: Transformer learns to layer-wise skim. In Smaranda Muresan, Preslav Nakov, and Aline Villavicencio, editors, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), ACL 2022, Dublin, Ireland, May 22-27, 2022, pages 7275--7286. Association for Computational Linguistics, 2022."},{"key":"e_1_3_2_1_21_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433722"},{"key":"e_1_3_2_1_22_1","volume-title":"International Conference on Learning Representations","author":"Guo Cong","year":"2022","unstructured":"Cong Guo, Yuxian Qiu, Jingwen Leng, Xiaotian Gao, Chen Zhang, Yunxin Liu, Fan Yang, Yuhao Zhu, and Minyi Guo. SQuant: On-the-fly data-free quantization via diagonal hessian approximation. In International Conference on Learning Representations, 2022."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICCD56317.2022.00113"},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1145\/3579371.3589038"},{"key":"e_1_3_2_1_25_1","doi-asserted-by":"publisher","DOI":"10.1109\/MICRO56248.2022.00095"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.1109\/DAC18072.2020.9218732"},{"key":"e_1_3_2_1_27_1","volume-title":"Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. arXiv preprint arXiv:1510.00149","author":"Han Song","year":"2015","unstructured":"Song Han, Huizi Mao, and William J Dally. Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. arXiv preprint arXiv:1510.00149, 2015."},{"key":"e_1_3_2_1_28_1","volume-title":"Pipedream: Fast and Efficient Pipeline Parallel DNN Training. arXiv:1806.03377","author":"Harlap Aaron","year":"2018","unstructured":"Aaron Harlap, Deepak Narayanan, Amar Phanishayee, Vivek Seshadri, Nikhil Devanur, Greg Ganger, and Phil Gibbons. Pipedream: Fast and Efficient Pipeline Parallel DNN Training. arXiv:1806.03377, 2018."},{"key":"e_1_3_2_1_29_1","volume-title":"Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685","author":"Hu Edward J","year":"2021","unstructured":"Edward J Hu, Yelong Shen, Phillip Wallis, Zeyuan Allen-Zhu, Yuanzhi Li, Shean Wang, Lu Wang, and Weizhu Chen. Lora: Low-rank adaptation of large language models. arXiv preprint arXiv:2106.09685, 2021."},{"key":"e_1_3_2_1_30_1","volume-title":"et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc V Le, et al. Gpipe: Efficient training of giant neural networks using pipeline parallelism. Advances in neural information processing systems, 2019."},{"key":"e_1_3_2_1_31_1","volume-title":"Proceedings of the ACM Java Grande Conference","author":"Hudson Richard L.","year":"2001","unstructured":"Richard L. Hudson and J. Eliot B. Moss. Sapphire: copying GC without stopping the world. In Denis Caromel, John Reynders, and Michael Philippsen, editors, Proceedings of the ACM Java Grande Conference, Stanford University. ACM, 2001."},{"key":"e_1_3_2_1_32_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00286"},{"key":"e_1_3_2_1_33_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.nlp.2023.100020"},{"key":"e_1_3_2_1_34_1","doi-asserted-by":"publisher","DOI":"10.1145\/3341301.3359630"},{"key":"e_1_3_2_1_35_1","volume-title":"International Symposium on Memory Management, ISMM '98, Vancouver, British Columbia, Canada, 17-19 October, 1998, Conference Proceedings. ACM","author":"Mark","year":"1998","unstructured":"Mark S. Johnstone and Paul R. Wilson. The memory fragmentation problem: Solved? In Simon L. Peyton Jones and Richard E. Jones, editors, International Symposium on Memory Management, ISMM '98, Vancouver, British Columbia, Canada, 17-19 October, 1998, Conference Proceedings. ACM, 1998."},{"key":"e_1_3_2_1_36_1","first-page":"012056","volume-title":"Journal of Physics: Conference Series","volume":"1740","author":"Khalilov Mikhail","unstructured":"Mikhail Khalilov and Alexey Timoveev. Performance analysis of cuda, openacc and openmp programming models on tesla v100 gpu. In Journal of Physics: Conference Series, volume 1740, page 012056. IOP Publishing, 2021."},{"key":"e_1_3_2_1_37_1","volume-title":"8th International Conference on Learning Representations, ICLR 2020","author":"Kitaev Nikita","year":"2020","unstructured":"Nikita Kitaev, Lukasz Kaiser, and Anselm Levskaya. Reformer: The efficient transformer. In 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26-30, 2020. OpenReview.net, 2020."},{"key":"e_1_3_2_1_38_1","first-page":"423","volume-title":"Proceedings of the 28th Asia and South Pacific Design Automation Conference, ASPDAC 2023","author":"Krauss Rune","year":"2023","unstructured":"Rune Krauss, Mehran Goli, and Rolf Drechsler. EDDY: A multi-core BDD package with dynamic memory management and reduced fragmentation. In Atsushi Takahashi, editor, Proceedings of the 28th Asia and South Pacific Design Automation Conference, ASPDAC 2023, Tokyo, Japan, January 16-19, 2023, pages 423--428. ACM, 2023."},{"key":"e_1_3_2_1_39_1","volume-title":"Ziplm: Hardware-aware structured pruning of language models. CoRR, abs\/2302.04089","author":"Kurtic Eldar","year":"2023","unstructured":"Eldar Kurtic, Elias Frantar, and Dan Alistarh. Ziplm: Hardware-aware structured pruning of language models. CoRR, abs\/2302.04089, 2023."},{"key":"e_1_3_2_1_40_1","first-page":"32","article-title":"A graph theoretic framework of recomputation algorithms for memory-efficient backpropagation","author":"Kusumoto Mitsuru","year":"2019","unstructured":"Mitsuru Kusumoto, Takuya Inoue, Gentaro Watanabe, Takuya Akiba, and Masanori Koyama. A graph theoretic framework of recomputation algorithms for memory-efficient backpropagation. Advances in Neural Information Processing Systems, 32, 2019.","journal-title":"Advances in Neural Information Processing Systems"},{"key":"e_1_3_2_1_41_1","doi-asserted-by":"publisher","DOI":"10.5555\/3026877.3026931"},{"key":"e_1_3_2_1_42_1","volume-title":"Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv:2006.16668","author":"Lepikhin Dmitry","year":"2020","unstructured":"Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen. Gshard: Scaling giant models with conditional computation and automatic sharding. arXiv:2006.16668, 2020."},{"key":"e_1_3_2_1_43_1","volume-title":"Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704","author":"Li Shen","year":"2020","unstructured":"Shen Li, Yanli Zhao, Rohan Varma, Omkar Salpekar, Pieter Noordhuis, Teng Li, Adam Paszke, Jeff Smith, Brian Vaughan, Pritam Damania, et al. Pytorch distributed: Experiences on accelerating data parallel training. arXiv preprint arXiv:2006.15704, 2020."},{"key":"e_1_3_2_1_44_1","volume-title":"Colossal-ai: A unified deep learning system for large-scale parallel training. arXiv preprint arXiv:2110.14883","author":"Li Shenggui","year":"2021","unstructured":"Shenggui Li, Jiarui Fang, Zhengda Bian, Hongxin Liu, Yuliang Liu, Haichen Huang, Boxiang Wang, and Yang You. Colossal-ai: A unified deep learning system for large-scale parallel training. arXiv preprint arXiv:2110.14883, 2021."},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476145"},{"key":"e_1_3_2_1_46_1","volume-title":"Efficient activation quantization via adaptive rounding border for post-training quantization. arXiv preprint arXiv:2208.11945","author":"Li Zhengyi","year":"2022","unstructured":"Zhengyi Li, Cong Guo, Zhanda Zhu, Yangjie Zhou, Yuxian Qiu, Xiaotian Gao, Jingwen Leng, and Minyi Guo. Efficient activation quantization via adaptive rounding border for post-training quantization. arXiv preprint arXiv:2208.11945, 2022."},{"key":"e_1_3_2_1_47_1","volume-title":"AWQ: activation-aware weight quantization for LLM compression and acceleration. CoRR, abs\/2306.00978","author":"Lin Ji","year":"2023","unstructured":"Ji Lin, Jiaming Tang, Haotian Tang, Shang Yang, Xingyu Dang, and Song Han. AWQ: activation-aware weight quantization for LLM compression and acceleration. CoRR, abs\/2306.00978, 2023."},{"key":"e_1_3_2_1_48_1","unstructured":"Linux man-pages project. mmap(2) --- Linux manual page."},{"key":"e_1_3_2_1_49_1","volume-title":"Summary of chatgpt\/gpt-4 research and perspective towards the future of large language models","author":"Liu Yiheng","year":"2023","unstructured":"Yiheng Liu, Tianle Han, Siyuan Ma, Jiayue Zhang, Yuanyuan Yang, Jiaming Tian, Hao He, Antong Li, Mengshen He, Zhengliang Liu, Zihao Wu, Dajiang Zhu, Xiang Li, Ning Qiang, Dingang Shen, Tianming Liu, and Bao Ge. Summary of chatgpt\/gpt-4 research and perspective towards the future of large language models, 2023."},{"key":"e_1_3_2_1_50_1","doi-asserted-by":"publisher","DOI":"10.1145\/3503222.3507752"},{"key":"e_1_3_2_1_51_1","doi-asserted-by":"publisher","DOI":"10.1145\/2749469.2749475"},{"key":"e_1_3_2_1_52_1","volume-title":"Advances in Neural Information Processing Systems","author":"Ma Xinyin","year":"2023","unstructured":"Xinyin Ma, Gongfan Fang, and Xinchao Wang. Llm-pruner: On the structural pruning of large language models. In Advances in Neural Information Processing Systems, 2023."},{"key":"e_1_3_2_1_53_1","volume-title":"Proceedings of the 7th International Symposium on Memory Management, ISMM 2008","author":"Marlow Simon","year":"2008","unstructured":"Simon Marlow, Tim Harris, Roshan P. James, and Simon L. Peyton Jones. Parallel generational-copying garbage collection with a block-structured heap. In Richard E. Jones and Stephen M. Blackburn, editors, Proceedings of the 7th International Symposium on Memory Management, ISMM 2008, Tucson, AZ, USA, June 7-8, 2008. ACM, 2008."},{"key":"e_1_3_2_1_54_1","volume-title":"IEEE\/ACM International Symposium on Microarchitecture (MICRO)","author":"Mars Jason","year":"2011","unstructured":"Jason Mars, Lingjia Tang, Robert Hundt, Kevin Skadron, and Mary Lou Soffa. Bubble-up: increasing utilization in modern warehouse scale computers via sensible co-locations. In IEEE\/ACM International Symposium on Microarchitecture (MICRO), 2011."},{"key":"e_1_3_2_1_55_1","volume-title":"An exact approach to the strip-packing problem. INFORMS journal on Computing, 15(3):310--319","author":"Martello Silvano","year":"2003","unstructured":"Silvano Martello, Michele Monaci, and Daniele Vigo. An exact approach to the strip-packing problem. INFORMS journal on Computing, 15(3):310--319, 2003."},{"key":"e_1_3_2_1_56_1","volume-title":"Virtualalloc function (memoryapi.h), 7","year":"2022","unstructured":"Microsoft. Virtualalloc function (memoryapi.h), 7 2022."},{"key":"e_1_3_2_1_57_1","volume-title":"ACM Computing Surveys","author":"Min Bonan","year":"2021","unstructured":"Bonan Min, Hayley Ross, Elior Sulem, Amir Pouran Ben Veyseh, Thien Huu Nguyen, Oscar Sainz, Eneko Agirre, Ilana Heintz, and Dan Roth. Recent advances in natural language processing via large pre-trained language models: A survey. ACM Computing Surveys, 2021."},{"key":"e_1_3_2_1_58_1","volume-title":"GPT-4 technical report. CoRR, abs\/2303.08774","author":"AI.","year":"2023","unstructured":"OpenAI. GPT-4 technical report. CoRR, abs\/2303.08774, 2023."},{"key":"e_1_3_2_1_59_1","volume-title":"The z garbage collector. https:\/\/github.com\/openjdk\/zgc","year":"2023","unstructured":"openjdk. The z garbage collector. https:\/\/github.com\/openjdk\/zgc, 2023."},{"key":"e_1_3_2_1_60_1","volume-title":"32nd Euromicro conference on real-time systems (ECRTS 2020","author":"Otterness Nathan","year":"2020","unstructured":"Nathan Otterness and James H Anderson. Amd gpus as an alternative to nvidia for supporting real-time workloads. In 32nd Euromicro conference on real-time systems (ECRTS 2020). Schloss Dagstuhl-Leibniz-Zentrum f\u00fcr Informatik, 2020."},{"key":"e_1_3_2_1_61_1","volume-title":"et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, et al. Pytorch: An imperative style, high-performance deep learning library. Advances in neural information processing systems, 2019."},{"key":"e_1_3_2_1_62_1","volume-title":"Introducing low-level gpu virtual memory management, 4","author":"Perry Cory","year":"2020","unstructured":"Cory Perry and Nikolay Sakharnykh. Introducing low-level gpu virtual memory management, 4 2020."},{"key":"e_1_3_2_1_63_1","unstructured":"Automatic Differentiation In Pytorch. Pytorch 2018."},{"key":"e_1_3_2_1_64_1","doi-asserted-by":"publisher","DOI":"10.1109\/HPCA47549.2020.00015"},{"key":"e_1_3_2_1_65_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00491"},{"issue":"8","key":"e_1_3_2_1_66_1","first-page":"9","article-title":"Language models are unsupervised multitask learners","volume":"1","author":"Radford Alec","year":"2019","unstructured":"Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei, Ilya Sutskever, et al. Language models are unsupervised multitask learners. OpenAI blog, 1(8):9, 2019.","journal-title":"OpenAI blog"},{"key":"e_1_3_2_1_67_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_68_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_69_1","volume-title":"2021 USENIX Annual Technical Conference (ATC)","author":"Ren Jie","year":"2021","unstructured":"Jie Ren, Samyam Rajbhandari, Reza Yazdani Aminabadi, Olatunji Ruwase, Shuangyan Yang, Minjia Zhang, Dong Li, and Yuxiong He. ZeRO-Offload: Democratizing Billion-Scale model training. In 2021 USENIX Annual Technical Conference (ATC), 2021."},{"key":"e_1_3_2_1_70_1","volume-title":"Gradient checkpointing","author":"Salimans Tim","year":"2017","unstructured":"Tim Salimans and Yaroslav Bulatov. Gradient checkpointing, 2017."},{"key":"e_1_3_2_1_71_1","volume-title":"High-throughput generative inference of large language models with a single GPU. CoRR, abs\/2303.06865","author":"Sheng Ying","year":"2023","unstructured":"Ying Sheng, Lianmin Zheng, Binhang Yuan, Zhuohan Li, Max Ryabinin, Daniel Y. Fu, Zhiqiang Xie, Beidi Chen, Clark W. Barrett, Joseph E. Gonzalez, Percy Liang, Christopher R\u00e9, Ion Stoica, and Ce Zhang. High-throughput generative inference of large language models with a single GPU. CoRR, abs\/2303.06865, 2023."},{"key":"e_1_3_2_1_72_1","volume-title":"Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053","author":"Shoeybi Mohammad","year":"2019","unstructured":"Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper, and Bryan Catanzaro. Megatron-lm: Training multi-billion parameter language models using model parallelism. arXiv preprint arXiv:1909.08053, 2019."},{"key":"e_1_3_2_1_73_1","doi-asserted-by":"publisher","DOI":"10.1145\/354880.354883"},{"key":"e_1_3_2_1_74_1","first-page":"52","volume-title":"Proceedings of the 5th International Symposium on Memory Management, ISMM 2006","author":"Siegwart David","year":"2006","unstructured":"David Siegwart and Martin Hirzel. Improving locality with parallel hierarchical copying GC. In Erez Petrank and J. Eliot B. Moss, editors, Proceedings of the 5th International Symposium on Memory Management, ISMM 2006, Ottawa, Ontario, Canada, June 10-11, 2006, pages 52--63. ACM, 2006."},{"key":"e_1_3_2_1_75_1","doi-asserted-by":"publisher","DOI":"10.1109\/TBDATA.2019.2957478"},{"key":"e_1_3_2_1_76_1","volume-title":"Bfc allocator","year":"2022","unstructured":"TensorFlow. Bfc allocator, 2022."},{"key":"e_1_3_2_1_77_1","volume-title":"Llama: Open and efficient foundation language models","author":"Touvron Hugo","year":"2023","unstructured":"Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timoth\u00e9e Lacroix, Baptiste Rozi\u00e8re, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, and Guillaume Lample. Llama: Open and efficient foundation language models, 2023."},{"key":"e_1_3_2_1_78_1","volume-title":"Attention is all you need","author":"Vaswani Ashish","year":"2023","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, and Illia Polosukhin. Attention is all you need, 2023."},{"key":"e_1_3_2_1_79_1","doi-asserted-by":"publisher","DOI":"10.1145\/2247684.2247693"},{"key":"e_1_3_2_1_80_1","doi-asserted-by":"publisher","DOI":"10.1109\/ISCA52012.2021.00088"},{"key":"e_1_3_2_1_81_1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00066"},{"key":"e_1_3_2_1_82_1","volume-title":"vllm: Easy, fast, and cheap llm serving with pagedattention. https:\/\/vllm.ai\/","author":"Woosuk Kwon","year":"2023","unstructured":"Kwon Woosuk, Li Zhuohan, Zhuang Siyuan, Sheng Ying, Zheng Lianmin, Yu Cody, Gonzalez Joey, Zhang Hao, and Stoica Ion. vllm: Easy, fast, and cheap llm serving with pagedattention. https:\/\/vllm.ai\/, 2023."},{"key":"e_1_3_2_1_83_1","doi-asserted-by":"publisher","DOI":"10.1109\/IPDPS54959.2023.00031"},{"key":"e_1_3_2_1_84_1","doi-asserted-by":"publisher","DOI":"10.1145\/3470496.3527406"},{"key":"e_1_3_2_1_85_1","doi-asserted-by":"publisher","DOI":"10.1145\/2485922.2485974"},{"key":"e_1_3_2_1_86_1","volume-title":"Xiexing Feng, and Thangarajah Akilan. Recomputation of the dense layers for performance improvement of dcnn","author":"Yang Yimin","year":"2019","unstructured":"Yimin Yang, QM Jonathan Wu, Xiexing Feng, and Thangarajah Akilan. Recomputation of the dense layers for performance improvement of dcnn. IEEE transactions on pattern analysis and machine intelligence, 42(11):2912--2925, 2019."},{"key":"e_1_3_2_1_87_1","volume-title":"Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, et al. Opt: Open pre-trained transformer language models","author":"Zhang Susan","year":"2022","unstructured":"Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen, Christopher Dewan, Mona Diab, Xian Li, Xi Victoria Lin, Todor Mihaylov, Myle Ott, Sam Shleifer, Kurt Shuster, et al. Opt: Open pre-trained transformer language models, 2022."},{"key":"e_1_3_2_1_88_1","volume-title":"A survey of large language models. arXiv preprint arXiv:2303.18223","author":"Zhao Wayne Xin","year":"2023","unstructured":"Wayne Xin Zhao, Kun Zhou, Junyi Li, Tianyi Tang, Xiaolei Wang, Yupeng Hou, Yingqian Min, Beichen Zhang, Junjie Zhang, Zican Dong, et al. A survey of large language models. arXiv preprint arXiv:2303.18223, 2023."},{"key":"e_1_3_2_1_89_1","volume-title":"Pytorch fsdp: experiences on scaling fully sharded data parallel. arXiv preprint arXiv:2304.11277","author":"Zhao Yanli","year":"2023","unstructured":"Yanli Zhao, Andrew Gu, Rohan Varma, Liang Luo, Chien-Chin Huang, Min Xu, Less Wright, Hamid Shojanazeri, Myle Ott, Sam Shleifer, et al. Pytorch fsdp: experiences on scaling fully sharded data parallel. arXiv preprint arXiv:2304.11277, 2023."},{"key":"e_1_3_2_1_90_1","volume-title":"Ion Stoica. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In Symposium on Operating Systems Design and Implementation (OSDI)","author":"Zheng Lianmin","year":"2020","unstructured":"Lianmin Zheng, Chengfan Jia, Minmin Sun, Zhao Wu, Cody Hao Yu, Ameer Haj-Ali, Yida Wang, Jun Yang, Danyang Zhuo, Koushik Sen, Joseph E. Gonzalez, and Ion Stoica. Ansor: Generating High-Performance Tensor Programs for Deep Learning. In Symposium on Operating Systems Design and Implementation (OSDI), 2020."},{"key":"e_1_3_2_1_91_1","doi-asserted-by":"publisher","DOI":"10.1145\/3575693.3575723"},{"key":"e_1_3_2_1_92_1","first-page":"233","volume-title":"16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22)","author":"Zhu Hongyu","year":"2022","unstructured":"Hongyu Zhu, Ruofan Wu, Yijia Diao, Shanbin Ke, Haoyu Li, Chen Zhang, Jilong Xue, Lingxiao Ma, Yuqing Xia, Wei Cui, Fan Yang, Mao Yang, Lidong Zhou, Asaf Cidon, and Gennady Pekhimenko. ROLLER: Fast and efficient tensor compilation for deep learning. In 16th USENIX Symposium on Operating Systems Design and Implementation (OSDI 22), pages 233--248, 2022."},{"key":"e_1_3_2_1_93_1","doi-asserted-by":"publisher","DOI":"10.1145\/3352460.3358269"},{"key":"e_1_3_2_1_94_1","article-title":"Effective training of convolutional neural networks with low-bitwidth weights and activations","author":"Zhuang Bohan","year":"2021","unstructured":"Bohan Zhuang, Mingkui Tan, Jing Liu, Lingqiao Liu, Ian Reid, and Chunhua Shen. Effective training of convolutional neural networks with low-bitwidth weights and activations. IEEE Transactions on Pattern Analysis and Machine Intelligence, 2021.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"}],"event":{"name":"ASPLOS '24: 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2","location":"La Jolla CA USA","acronym":"ASPLOS '24","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture","SIGOPS ACM Special Interest Group on Operating Systems","SIGPLAN ACM Special Interest Group on Programming Languages","SIGBED ACM Special Interest Group on Embedded Systems"]},"container-title":["Proceedings of the 29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640423","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3620665.3640423","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,19]],"date-time":"2025-06-19T00:03:42Z","timestamp":1750291422000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3620665.3640423"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,4,27]]},"references-count":94,"alternative-id":["10.1145\/3620665.3640423","10.1145\/3620665"],"URL":"https:\/\/doi.org\/10.1145\/3620665.3640423","relation":{},"subject":[],"published":{"date-parts":[[2024,4,27]]},"assertion":[{"value":"2024-04-27","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}