{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:40:04Z","timestamp":1755877204030,"version":"3.44.0"},"publisher-location":"New York, NY, USA","reference-count":59,"publisher":"ACM","license":[{"start":{"date-parts":[[2024,5,30]],"date-time":"2024-05-30T00:00:00Z","timestamp":1717027200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2024,5,30]]},"DOI":"10.1145\/3650200.3656619","type":"proceedings-article","created":{"date-parts":[[2024,6,3]],"date-time":"2024-06-03T14:11:54Z","timestamp":1717423914000},"page":"485-497","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Sylva: Sparse Embedded Adapters via Hierarchical Approximate Second-Order Information"],"prefix":"10.1145","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5447-205X","authenticated-orcid":false,"given":"Baorun","family":"Mu","sequence":"first","affiliation":[{"name":"University of Toronto, Vector Institute, CentML, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-0162-4547","authenticated-orcid":false,"given":"Christina","family":"Giannoula","sequence":"additional","affiliation":[{"name":"University of Toronto, CentML, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5150-0353","authenticated-orcid":false,"given":"Shang","family":"Wang","sequence":"additional","affiliation":[{"name":"University of Toronto, Vector Institute, CentML, Canada"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-3839-0919","authenticated-orcid":false,"given":"Gennady","family":"Pekhimenko","sequence":"additional","affiliation":[{"name":"University of Toronto, Vector Institute, CentML, 
Canada"}]}],"member":"320","published-online":{"date-parts":[[2024,6,3]]},"reference":[{"key":"e_1_3_2_1_1_1","doi-asserted-by":"publisher","DOI":"10.1162\/089976698300017746"},{"key":"e_1_3_2_1_2_1","unstructured":"Fengxiang Bie Yibo Yang Zhongzhu Zhou Adam Ghanem Minjia Zhang Zhewei Yao Xiaoxia Wu Connor Holmes Pareesa Golnari David\u00a0A. Clifton Yuxiong He Dacheng Tao and Shuaiwen\u00a0Leon Song. 2023. RenAIssance: A Survey into AI Text-to-Image Generation in the Era of Large Model. arxiv:2309.00810\u00a0[cs.CV]"},{"key":"e_1_3_2_1_3_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.laa.2005.07.021"},{"key":"e_1_3_2_1_4_1","volume-title":"Inc.","author":"Brown Tom","year":"2020","unstructured":"Tom Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared\u00a0D Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel Ziegler, Jeffrey Wu, Clemens Winter, Chris Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, and Dario Amodei. 2020. Language Models are Few-Shot Learners. In Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. Curran Associates, Inc., Vancouver, Canada, 1877\u20131901."},{"key":"e_1_3_2_1_5_1","volume-title":"VENOM: A Vectorized N:M Format for Unleashing the Power of Sparse Tensor Cores. arxiv:2310.02065\u00a0[cs.DC]","author":"Castro L.","year":"2023","unstructured":"Roberto\u00a0L. Castro, Andrei Ivanov, Diego Andrade, Tal Ben-Nun, Basilio\u00a0B. Fraguela, and Torsten Hoefler. 2023. VENOM: A Vectorized N:M Format for Unleashing the Power of Sparse Tensor Cores. 
arxiv:2310.02065\u00a0[cs.DC]"},{"key":"e_1_3_2_1_6_1","unstructured":"Mark Chen Jerry Tworek Heewoo Jun Qiming Yuan Henrique\u00a0Ponde de Oliveira\u00a0Pinto Jared Kaplan Harri Edwards Yuri Burda Nicholas Joseph Greg Brockman Alex Ray Raul Puri Gretchen Krueger Michael Petrov Heidy Khlaaf Girish Sastry Pamela Mishkin Brooke Chan Scott Gray Nick Ryder Mikhail Pavlov Alethea Power Lukasz Kaiser Mohammad Bavarian Clemens Winter Philippe Tillet Felipe\u00a0Petroski Such Dave Cummings Matthias Plappert Fotios Chantzis Elizabeth Barnes Ariel Herbert-Voss William\u00a0Hebgen Guss Alex Nichol Alex Paino Nikolas Tezak Jie Tang Igor Babuschkin Suchir Balaji Shantanu Jain William Saunders Christopher Hesse Andrew\u00a0N. Carr Jan Leike Josh Achiam Vedant Misra Evan Morikawa Alec Radford Matthew Knight Miles Brundage Mira Murati Katie Mayer Peter Welinder Bob McGrew Dario Amodei Sam McCandlish Ilya Sutskever and Wojciech Zaremba. 2021. Evaluating Large Language Models Trained on Code. arxiv:2107.03374\u00a0[cs.LG]"},{"key":"e_1_3_2_1_7_1","unstructured":"Tianqi Chen Bing Xu Chiyuan Zhang and Carlos Guestrin. 2016. Training Deep Nets with Sublinear Memory Cost. arxiv:1604.06174\u00a0[cs.LG]"},{"volume-title":"Advances in Neural Information Processing Systems, A.\u00a0Oh, T.\u00a0Neumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.). Vol.\u00a036. Curran Associates","author":"Dettmers Tim","key":"e_1_3_2_1_8_1","unstructured":"Tim Dettmers, Artidoro Pagnoni, Ari Holtzman, and Luke Zettlemoyer. 2023. QLoRA: Efficient Finetuning of Quantized LLMs. In Advances in Neural Information Processing Systems, A.\u00a0Oh, T.\u00a0Neumann, A.\u00a0Globerson, K.\u00a0Saenko, M.\u00a0Hardt, and S.\u00a0Levine (Eds.). Vol.\u00a036. 
Curran Associates, Inc., New Orleans, USA, 10088\u201310115."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/N19-1423"},{"key":"e_1_3_2_1_10_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2023.emnlp-main.252"},{"key":"e_1_3_2_1_11_1","doi-asserted-by":"publisher","DOI":"10.1007\/BF02288367"},{"volume-title":"Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.). Vol.\u00a035. Curran Associates","author":"Frantar Elias","key":"e_1_3_2_1_12_1","unstructured":"Elias Frantar and Dan Alistarh. 2022. Optimal Brain Compression: A Framework for Accurate Post-Training Quantization and Pruning. In Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.). Vol.\u00a035. Curran Associates, Inc., New Orleans, USA, 4475\u20134488."},{"key":"e_1_3_2_1_13_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0202)","author":"Frantar Elias","year":"2023","unstructured":"Elias Frantar and Dan Alistarh. 2023. SparseGPT: Massive Language Models Can be Accurately Pruned in One-Shot. In Proceedings of the 40th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a0202), Andreas Krause, Emma Brunskill, Kyunghyun Cho, Barbara Engelhardt, Sivan Sabato, and Jonathan Scarlett (Eds.). PMLR, Honolulu Hawaii USA, 10323\u201310337."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.5555\/3433701.3433723"},{"key":"e_1_3_2_1_15_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097)","author":"Ghorbani Behrooz","year":"2019","unstructured":"Behrooz Ghorbani, Shankar Krishnan, and Ying Xiao. 2019. 
An Investigation into Neural Net Optimization via Hessian Eigenvalue Density. In Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, Long Beach, USA, 2232\u20132241."},{"key":"e_1_3_2_1_16_1","unstructured":"Erin Griffith. 2023. The Desperate Hunt for the A.I. Boom\u2019s Most Indispensable Prize. The New York Times. https:\/\/www.nytimes.com\/2023\/08\/16\/technology\/ai-gpu-chips-shortage.html"},{"key":"e_1_3_2_1_17_1","unstructured":"Roger Grosse and James Martens. 2016. A Kronecker-factored approximate Fisher matrix for convolution layers. arxiv:1602.01407\u00a0[stat.ML]"},{"key":"e_1_3_2_1_18_1","unstructured":"Babak Hassibi David Stork and Gregory Wolff. 1993. Optimal Brain Surgeon: Extensions and performance comparisons. In Advances in Neural Information Processing Systems J.\u00a0Cowan G.\u00a0Tesauro and J.\u00a0Alspector (Eds.). Vol.\u00a06. Morgan-Kaufmann Denver USA."},{"key":"e_1_3_2_1_19_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097)","author":"Houlsby Neil","year":"2019","unstructured":"Neil Houlsby, Andrei Giurgiu, Stanislaw Jastrzebski, Bruna Morrone, Quentin De\u00a0Laroussilhe, Andrea Gesmundo, Mona Attariyan, and Sylvain Gelly. 2019. Parameter-Efficient Transfer Learning for NLP. In Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, Long Beach, USA, 2790\u20132799."},{"key":"e_1_3_2_1_20_1","unstructured":"Edward\u00a0J. Hu Yelong Shen Phillip Wallis Zeyuan Allen-Zhu Yuanzhi Li Shean Wang Lu Wang and Weizhu Chen. 2021. LoRA: Low-Rank Adaptation of Large Language Models. 
arxiv:2106.09685\u00a0[cs.CL]"},{"key":"e_1_3_2_1_21_1","volume-title":"Advances in Neural Information Processing Systems, H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.). Vol.\u00a032. Curran Associates","author":"Huang Yanping","year":"2019","unstructured":"Yanping Huang, Youlong Cheng, Ankur Bapna, Orhan Firat, Dehao Chen, Mia Chen, HyoukJoong Lee, Jiquan Ngiam, Quoc\u00a0V Le, Yonghui Wu, and zhifeng Chen. 2019. GPipe: Efficient Training of Giant Neural Networks using Pipeline Parallelism. In Advances in Neural Information Processing Systems, H.\u00a0Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d'Alch\u00e9-Buc, E.\u00a0Fox, and R.\u00a0Garnett (Eds.). Vol.\u00a032. Curran Associates, Inc., Vancouver, Canada. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2019\/file\/093f65e080a295f8076b1c5722a46aa2-Paper.pdf"},{"key":"e_1_3_2_1_22_1","volume-title":"Proceedings of the Twenty-Second International Conference on Artificial Intelligence and Statistics(Proceedings of Machine Learning Research, Vol.\u00a089)","author":"Karakida Ryo","year":"2019","unstructured":"Ryo Karakida, Shotaro Akaho, and Shun-ichi Amari. 2019. Universal Statistics of Fisher Information in Deep Neural Networks: Mean Field Approach. In Proceedings of the Twenty-Second International Conference on Artificial Intelligence and Statistics(Proceedings of Machine Learning Research, Vol.\u00a089), Kamalika Chaudhuri and Masashi Sugiyama (Eds.). PMLR, Naha, Okinawa, Japan, 1032\u20131041."},{"key":"e_1_3_2_1_23_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.279"},{"key":"e_1_3_2_1_24_1","volume-title":"Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.). Vol.\u00a035. 
Curran Associates","author":"Kwon Woosuk","year":"2022","unstructured":"Woosuk Kwon, Sehoon Kim, Michael\u00a0W Mahoney, Joseph Hassoun, Kurt Keutzer, and Amir Gholami. 2022. A Fast Post-Training Pruning Framework for Transformers. In Advances in Neural Information Processing Systems, S.\u00a0Koyejo, S.\u00a0Mohamed, A.\u00a0Agarwal, D.\u00a0Belgrave, K.\u00a0Cho, and A.\u00a0Oh (Eds.). Vol.\u00a035. Curran Associates, Inc., New Orleans, 24101\u201324116. https:\/\/proceedings.neurips.cc\/paper_files\/paper\/2022\/file\/987bed997ab668f91c822a09bce3ea12-Paper-Conference.pdf"},{"key":"e_1_3_2_1_25_1","unstructured":"Andreas K\u00f6pf Yannic Kilcher Dimitri von R\u00fctte Sotiris Anagnostidis Zhi-Rui Tam Keith Stevens Abdullah Barhoum Nguyen\u00a0Minh Duc Oliver Stanley Rich\u00e1rd Nagyfi Shahul ES Sameer Suri David Glushkov Arnav Dantuluri Andrew Maguire Christoph Schuhmann Huu Nguyen and Alexander Mattick. 2023. OpenAssistant Conversations \u2013 Democratizing Large Language Model Alignment. arxiv:2304.07327\u00a0[cs.CL]"},{"key":"e_1_3_2_1_26_1","doi-asserted-by":"publisher","DOI":"10.5555\/2969830.2969903"},{"key":"e_1_3_2_1_27_1","doi-asserted-by":"publisher","DOI":"10.1126\/science.abq1158"},{"key":"e_1_3_2_1_28_1","volume-title":"Proceedings of the 40th International Conference on Machine Learning(ICML\u201923)","author":"Li Yixiao","year":"2023","unstructured":"Yixiao Li, Yifan Yu, Qingru Zhang, Chen Liang, Pengcheng He, Weizhu Chen, and Tuo Zhao. 2023. LoSparse: structured compression of large language models based on low-rank and sparse approximation. In Proceedings of the 40th International Conference on Machine Learning(ICML\u201923). JMLR.org, Honolulu, Hawaii, USA, Article 839, 15\u00a0pages."},{"key":"e_1_3_2_1_29_1","volume-title":"Advances in Neural Information Processing Systems, M.\u00a0Ranzato, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.S. Liang, and J.\u00a0Wortman Vaughan (Eds.). Vol.\u00a034. 
Curran Associates","author":"Liao Zhenyu","year":"2021","unstructured":"Zhenyu Liao and Michael\u00a0W Mahoney. 2021. Hessian Eigenspectra of More Realistic Nonlinear Models. In Advances in Neural Information Processing Systems, M.\u00a0Ranzato, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.S. Liang, and J.\u00a0Wortman Vaughan (Eds.). Vol.\u00a034. Curran Associates, Inc., Virtual, 20104\u201320117."},{"key":"e_1_3_2_1_30_1","volume-title":"Efficient GPU Kernels for N:M-SPARSE Weights in Deep Learning. In Sixth Conference on Machine Learning and Systems (MLSys\u201923)","author":"Lin Bin","year":"2023","unstructured":"Bin Lin, Ningxin Zheng, Lei Wang, Shijie Cao, Lingxiao Ma, Quanlu Zhang, Yi Zhu, Ting Cao, Jilong Xue, Yuqing Yang, and Fan Yang. 2023. Efficient GPU Kernels for N:M-SPARSE Weights in Deep Learning. In Sixth Conference on Machine Learning and Systems (MLSys\u201923). Machine Learning and Systems, Miami, USA."},{"key":"e_1_3_2_1_31_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.findings-emnlp.41"},{"key":"e_1_3_2_1_32_1","unstructured":"Yinhan Liu Myle Ott Naman Goyal Jingfei Du Mandar Joshi Danqi Chen Omer Levy Mike Lewis Luke Zettlemoyer and Veselin Stoyanov. 2020. Ro{BERT}a: A Robustly Optimized {BERT} Pretraining Approach."},{"key":"e_1_3_2_1_33_1","first-page":"1","article-title":"New Insights and Perspectives on the Natural Gradient Method","volume":"21","author":"Martens James","year":"2020","unstructured":"James Martens. 2020. New Insights and Perspectives on the Natural Gradient Method. Journal of Machine Learning Research 21, 146 (2020), 1\u201376.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_34_1","volume-title":"Proceedings of the 32nd International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a037)","author":"Martens James","year":"2015","unstructured":"James Martens and Roger Grosse. 2015. Optimizing Neural Networks with Kronecker-factored Approximate Curvature. 
In Proceedings of the 32nd International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a037), Francis Bach and David Blei (Eds.). PMLR, Lille, France, 2408\u20132417."},{"key":"e_1_3_2_1_35_1","unstructured":"Nvidia Mellanox. 2014. Introducing EDR 100Gb\/s - Enabling the Use of Data. https:\/\/network.nvidia.com\/pdf\/whitepapers\/wp_introducing_edr_100gb_enabling_use_data.pdf"},{"key":"e_1_3_2_1_36_1","doi-asserted-by":"publisher","DOI":"10.1093\/qmath\/11.1.50"},{"key":"e_1_3_2_1_37_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476209"},{"key":"e_1_3_2_1_38_1","volume-title":"Proceedings of the 33rd International Conference on Neural Information Processing Systems. Curran Associates Inc.","author":"Paszke Adam","year":"2019","unstructured":"Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas K\u00f6pf, Edward Yang, Zach DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 2019. PyTorch: An Imperative Style, High-Performance Deep Learning Library. In Proceedings of the 33rd International Conference on Neural Information Processing Systems. Curran Associates Inc., Red Hook, NY, USA, Article 721, 12\u00a0pages."},{"key":"e_1_3_2_1_39_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W18-6557"},{"key":"e_1_3_2_1_40_1","unstructured":"Alec Radford Jeff Wu Rewon Child David Luan Dario Amodei and Ilya Sutskever. 2019. Language Models are Unsupervised Multitask Learners. Open AI."},{"key":"e_1_3_2_1_41_1","first-page":"1","article-title":"Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer","volume":"21","author":"Raffel Colin","year":"2020","unstructured":"Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, and Peter\u00a0J. Liu. 2020. 
Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer. Journal of Machine Learning Research 21, 140 (2020), 1\u201367.","journal-title":"Journal of Machine Learning Research"},{"key":"e_1_3_2_1_42_1","doi-asserted-by":"crossref","unstructured":"Samyam Rajbhandari Jeff Rasley Olatunji Ruwase and Yuxiong He. 2020. ZeRO: Memory Optimizations Toward Training Trillion Parameter Models. ArXiv.","DOI":"10.1109\/SC41405.2020.00024"},{"key":"e_1_3_2_1_43_1","doi-asserted-by":"publisher","DOI":"10.1145\/3458817.3476205"},{"key":"e_1_3_2_1_44_1","doi-asserted-by":"publisher","DOI":"10.1145\/3394486.3406703"},{"key":"e_1_3_2_1_45_1","doi-asserted-by":"crossref","unstructured":"Robin Rombach Andreas Blattmann Dominik Lorenz Patrick Esser and Bj\u00f6rn Ommer. 2022. High-Resolution Image Synthesis with Latent Diffusion Models. arxiv:2112.10752\u00a0[cs.CV]","DOI":"10.1109\/CVPR52688.2022.01042"},{"key":"e_1_3_2_1_46_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.626"},{"key":"e_1_3_2_1_47_1","unstructured":"Alec\u00a0Radford Scott\u00a0Gray and Diederik\u00a0P. Kingma. 2017. GPU Kernels for Block-Sparse Weights. https:\/\/cdn.openai.com\/blocksparse\/blocksparsepaper.pdf"},{"key":"e_1_3_2_1_48_1","volume-title":"TSMC: Shortage of Nvidia\u2019s AI GPUs to Persist for 1.5 Years. Tom\u2019s Hardware. https:\/\/www.tomshardware.com\/news\/tsmc-shortage-of-nvidias-ai-gpus-to-persist-for-15-years","author":"Shilov Anton","year":"2023","unstructured":"Anton Shilov. 2023. TSMC: Shortage of Nvidia\u2019s AI GPUs to Persist for 1.5 Years. Tom\u2019s Hardware. https:\/\/www.tomshardware.com\/news\/tsmc-shortage-of-nvidias-ai-gpus-to-persist-for-15-years"},{"key":"e_1_3_2_1_49_1","volume-title":"Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. 
Curran Associates","author":"Singh Sidak\u00a0Pal","year":"2020","unstructured":"Sidak\u00a0Pal Singh and Dan Alistarh. 2020. WoodFisher: Efficient Second-Order Approximation for Neural Network Compression. In Advances in Neural Information Processing Systems, H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.F. Balcan, and H.\u00a0Lin (Eds.). Vol.\u00a033. Curran Associates, Inc., Virtual, 18098\u201318109."},{"volume-title":"Advances in Neural Information Processing Systems, M.\u00a0Ranzato, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.S. Liang, and J.\u00a0Wortman Vaughan (Eds.). Vol.\u00a034. Curran Associates","author":"Singh Sidak\u00a0Pal","key":"e_1_3_2_1_50_1","unstructured":"Sidak\u00a0Pal Singh, Gregor Bachmann, and Thomas Hofmann. 2021. Analytic Insights into Structure and Rank of Neural Network Hessian Maps. In Advances in Neural Information Processing Systems, M.\u00a0Ranzato, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.S. Liang, and J.\u00a0Wortman Vaughan (Eds.). Vol.\u00a034. Curran Associates, Inc., Virtual, 23914\u201323927."},{"volume-title":"Advances in Neural Information Processing Systems, M.\u00a0Ranzato, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.S. Liang, and J.\u00a0Wortman Vaughan (Eds.). Vol.\u00a034. Curran Associates","author":"Sung Yi-Lin","key":"e_1_3_2_1_51_1","unstructured":"Yi-Lin Sung, Varun Nair, and Colin\u00a0A Raffel. 2021. Training Neural Networks with Fixed Sparse Masks. In Advances in Neural Information Processing Systems, M.\u00a0Ranzato, A.\u00a0Beygelzimer, Y.\u00a0Dauphin, P.S. Liang, and J.\u00a0Wortman Vaughan (Eds.). Vol.\u00a034. 
Curran Associates, Inc., Virtual, 24193\u201324205."},{"key":"e_1_3_2_1_52_1","doi-asserted-by":"publisher","DOI":"10.1145\/3315508.3329973"},{"key":"e_1_3_2_1_53_1","unstructured":"Hugo Touvron Thibaut Lavril Gautier Izacard Xavier Martinet Marie-Anne Lachaux Timoth\u00e9e Lacroix Baptiste Rozi\u00e8re Naman Goyal Eric Hambro Faisal Azhar Aurelien Rodriguez Armand Joulin Edouard Grave and Guillaume Lample. 2023. LLaMa: Open and Efficient Foundation Language Models. arxiv:2302.13971\u00a0[cs.CL]"},{"volume-title":"Advances in Neural Information Processing Systems, I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.). Vol.\u00a030. Curran Associates","author":"Vaswani Ashish","key":"e_1_3_2_1_54_1","unstructured":"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan\u00a0N Gomez, \u0141\u00a0ukasz Kaiser, and Illia Polosukhin. 2017. Attention is All you Need. In Advances in Neural Information Processing Systems, I.\u00a0Guyon, U.\u00a0Von Luxburg, S.\u00a0Bengio, H.\u00a0Wallach, R.\u00a0Fergus, S.\u00a0Vishwanathan, and R.\u00a0Garnett (Eds.). Vol.\u00a030. Curran Associates, Inc., Long Beach, USA."},{"key":"e_1_3_2_1_55_1","doi-asserted-by":"publisher","unstructured":"Alex Wang Amanpreet Singh Julian Michael Felix Hill Omer Levy and Samuel Bowman. 2018. GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding. In Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP Tal Linzen Grzegorz Chrupa\u0142a and Afra Alishahi (Eds.). Association for Computational Linguistics Brussels Belgium 353\u2013355. 
https:\/\/doi.org\/10.18653\/v1\/W18-5446","DOI":"10.18653\/v1\/W18-5446"},{"key":"e_1_3_2_1_56_1","volume-title":"Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097)","author":"Wang Chaoqi","year":"2019","unstructured":"Chaoqi Wang, Roger Grosse, Sanja Fidler, and Guodong Zhang. 2019. EigenDamage: Structured Pruning in the Kronecker-Factored Eigenbasis. In Proceedings of the 36th International Conference on Machine Learning(Proceedings of Machine Learning Research, Vol.\u00a097), Kamalika Chaudhuri and Ruslan Salakhutdinov (Eds.). PMLR, Long Beach, USA, 6566\u20136575."},{"key":"e_1_3_2_1_57_1","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2022.emnlp-main.340"},{"key":"e_1_3_2_1_58_1","volume-title":"Harnessing the Power of LLMs in Practice: A Survey on ChatGPT and Beyond. CoRR abs\/2304.13712","author":"Yang Jingfeng","year":"2023","unstructured":"Jingfeng Yang, Hongye Jin, Ruixiang Tang, Xiaotian Han, Qizhang Feng, Haoming Jiang, Bing Yin, and Xia Hu. 2023. Harnessing the Power of LLMs in Practice: A Survey on ChatGPT and Beyond. CoRR abs\/2304.13712 (2023)."},{"key":"e_1_3_2_1_59_1","volume-title":"APT: Adaptive Pruning and Tuning Pretrained Language Models for Efficient Training and Inference. arxiv:2401.12200\u00a0[cs.CL]","author":"Zhao Bowen","year":"2024","unstructured":"Bowen Zhao, Hannaneh Hajishirzi, and Qingqing Cao. 2024. APT: Adaptive Pruning and Tuning Pretrained Language Models for Efficient Training and Inference. 
arxiv:2401.12200\u00a0[cs.CL]"}],"event":{"name":"ICS '24: 2024 International Conference on Supercomputing","sponsor":["SIGARCH ACM Special Interest Group on Computer Architecture"],"location":"Kyoto Japan","acronym":"ICS '24"},"container-title":["Proceedings of the 38th ACM International Conference on Supercomputing"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656619","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3650200.3656619","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,22]],"date-time":"2025-08-22T15:22:36Z","timestamp":1755876156000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3650200.3656619"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,5,30]]},"references-count":59,"alternative-id":["10.1145\/3650200.3656619","10.1145\/3650200"],"URL":"https:\/\/doi.org\/10.1145\/3650200.3656619","relation":{},"subject":[],"published":{"date-parts":[[2024,5,30]]},"assertion":[{"value":"2024-06-03","order":3,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}