{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T03:57:50Z","timestamp":1781150270167,"version":"3.54.1"},"reference-count":196,"publisher":"Elsevier BV","license":[{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/tdm\/userlicense\/1.0\/"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.elsevier.com\/legal\/tdmrep-license"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-017"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-012"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,9,1]],"date-time":"2026-09-01T00:00:00Z","timestamp":1788220800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-004"}],"funder":[{"DOI":"10.13039\/501100010418","name":"Institute for Information and Communications Technology Promotion","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100010418","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100003052","name":"Ministry of Trade, Industry and Energy","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100003052","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100014188","name":"Ministry of Science and ICT, South Korea","doi-asserted-by":"publisher","award":["IITP-2026-RS-2022-00156295"],"award-info":[{"award-number":["IITP-2026-RS-2022-00156295"]}],"id":[{"id":"10.13039\/501100014188","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["elsevier.com","sciencedirect.com"],"crossmark-restriction":true},"short-container-title":["Neural Networks"],"published-print":{"date-parts":[[2026,9]]},"DOI":"10.1016\/j.neunet.2026.108900","type":"journal-article","created":{"date-parts":[[2026,3,28]],"date-time":"2026-03-28T16:11:01Z","timestamp":1774714261000},"page":"108900","update-policy":"https:\/\/doi.org\/10.1016\/elsevier_cm_policy","source":"Crossref","is-referenced-by-count":1,"special_numbering":"C","title":["Towards efficient language giants: A comprehensive survey on structural optimizations and compression techniques for large language models"],"prefix":"10.1016","volume":"201","author":[{"given":"Gilhyeon","family":"Lee","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Seonggeun","family":"Kim","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Dongjun","family":"Lee","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Kyungmin","family":"Goh","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-7962-657X","authenticated-orcid":false,"given":"Hyun","family":"Kim","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"78","reference":[{"key":"10.1016\/j.neunet.2026.108900_bib0001","unstructured":"Achiam, J., Adler, S., Agarwal, S., Ahmad, L., Akkaya, I., Aleman, F. L., Almeida, D., Altenschmidt, J., Altman, S., Anadkat, S. et al. (2023). Gpt-4 technical report. arXiv: 2303.08774."},{"key":"10.1016\/j.neunet.2026.108900_bib0002","doi-asserted-by":"crossref","first-page":"468","DOI":"10.1162\/tacl_a_00471","article-title":"Topiocqa: Open-domain conversational question answering with topic switching","volume":"10","author":"Adlakha","year":"2022","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"10.1016\/j.neunet.2026.108900_bib0003","unstructured":"Agarwal, R., Vieillard, N., Stanczyk, P., Ramos, S., Geist, M., & Bachem, O. (2023). Gkd: Generalized knowledge distillation for auto-regressive sequence models. 12, arXiv: 2306.1349."},{"key":"10.1016\/j.neunet.2026.108900_bib0004","doi-asserted-by":"crossref","unstructured":"Ainslie, J., Lee-Thorp, J., De Jong, M., Zemlyanskiy, Y., Lebr\u00f3n, F., & Sanghai, S. (2023). Gqa: Training generalized multi-query transformer models from multi-head checkpoints. arXiv: 2305.13245.","DOI":"10.18653\/v1\/2023.emnlp-main.298"},{"key":"10.1016\/j.neunet.2026.108900_bib0005","unstructured":"Almazrouei, E., Alobeidli, H., Alshamsi, A., Cappelli, A., Cojocaru, R., Debbah, M., Goffinet, \u00c9., Hesslow, D., Launay, J., Malartic, Q. et al. (2023). The falcon series of open language models. arXiv: 2311.16867."},{"key":"10.1016\/j.neunet.2026.108900_bib0006","doi-asserted-by":"crossref","unstructured":"Anantha, R., Vakulenko, S., Tu, Z., Longpre, S., Pulman, S., & Chappidi, S. (2020). Open-domain question answering goes conversational via question rewriting. arXiv: 2010.04898.","DOI":"10.18653\/v1\/2021.naacl-main.44"},{"key":"10.1016\/j.neunet.2026.108900_bib0007","unstructured":"Ashkboos, S., Croci, M. L., Nascimento, M. G. d., Hoefler, T., & Hensman, J. (2024). Slicegpt: Compress large language models by deleting rows and columns. arXiv: 2401.15024."},{"key":"10.1016\/j.neunet.2026.108900_bib0008","unstructured":"Ba, J. L., Kiros, J. R., & Hinton, G. E. (2016a). Layer normalization. arXiv: 1607.06450."},{"key":"10.1016\/j.neunet.2026.108900_bib0009","first-page":"21","article-title":"Layer normalization","volume":"1050","author":"Ba","year":"2016","journal-title":"Stat"},{"key":"10.1016\/j.neunet.2026.108900_bib0010","series-title":"Uncertainty in artificial intelligence","first-page":"1352","article-title":"Rezero is all you need: Fast convergence at large depth","author":"Bachlechner","year":"2021"},{"key":"10.1016\/j.neunet.2026.108900_bib0011","series-title":"The thirteenth international conference on learning representations","article-title":"Relaxed recursive transformers: Effective parameter sharing with layer-wise loRA","author":"Bae","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0012","doi-asserted-by":"crossref","first-page":"117580","DOI":"10.52202\/079017-3733","article-title":"Adaptive sampling for efficient softmax approximation","volume":"37","author":"Baharav","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0013","unstructured":"Bai, G., Chai, Z., Ling, C., Wang, S., Lu, J., Zhang, N., Shi, T., Yu, Z., Zhu, M., Zhang, Y. et al. (2024a). Beyond efficiency: A systematic survey of resource-efficient large language models. arXiv: 2401.00625."},{"key":"10.1016\/j.neunet.2026.108900_bib0014","doi-asserted-by":"crossref","first-page":"46203","DOI":"10.52202\/079017-1468","article-title":"SparseLLM: Towards global pruning of pre-trained language models","volume":"37","author":"Bai","year":"2024","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0015","unstructured":"Bai, J., Bai, S., Chu, Y., Cui, Z., Dang, K., Deng, X., Fan, Y., Ge, W., Han, Y., Huang, F. et al. (2023a). Qwen technical report. arXiv: 2309.16609."},{"key":"10.1016\/j.neunet.2026.108900_bib0016","unstructured":"Bai, Y., Lv, X., Zhang, J., Lyu, H., Tang, J., Huang, Z., Du, Z., Liu, X., Zeng, A., Hou, L. et al. (2023b). Longbench: A bilingual, multitask benchmark for long context understanding. arXiv: 2308.14508."},{"key":"10.1016\/j.neunet.2026.108900_bib0017","unstructured":"Bajaj, P., Campos, D., Craswell, N., Deng, L., Gao, J., Liu, X., Majumder, R., McNamara, A., Mitra, B., Nguyen, T. et al. (2016). Ms marco: A human generated machine reading comprehension dataset. arXiv: 1611.09268."},{"key":"10.1016\/j.neunet.2026.108900_bib0018","doi-asserted-by":"crossref","unstructured":"Banerjee, P., Pal, K. K., Mitra, A., & Baral, C. (2019). Careful selection of knowledge to solve open book question answering. arXiv: 1907.10738.","DOI":"10.18653\/v1\/P19-1615"},{"key":"10.1016\/j.neunet.2026.108900_bib0019","series-title":"Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization","first-page":"65","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","author":"Banerjee","year":"2005"},{"key":"10.1016\/j.neunet.2026.108900_bib0020","doi-asserted-by":"crossref","unstructured":"Barros, S. (2025). Solving AI foundational model latency with telco infrastructure. arXiv: 2504.03708.","DOI":"10.2139\/ssrn.5195333"},{"key":"10.1016\/j.neunet.2026.108900_bib0021","unstructured":"Beck, M., P\u00f6ppel, K., Lippe, P., & Hochreiter, S. (2025). Tiled flash linear attention: More efficient linear RNN and xLSTM kernels. arXiv: 2503.14376."},{"key":"10.1016\/j.neunet.2026.108900_bib0022","unstructured":"Beltagy, I., Peters, M. E., & Cohan, A. (2020). Longformer: The long-document transformer. arXiv: 2004.05150."},{"key":"10.1016\/j.neunet.2026.108900_bib0023","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"7432","article-title":"Piqa: Reasoning about physical commonsense in natural language","volume":"vol. 34","author":"Bisk","year":"2020"},{"key":"10.1016\/j.neunet.2026.108900_bib0024","series-title":"Proceedings of the ninth workshop on statistical machine translation","first-page":"12","article-title":"Findings of the 2014 workshop on statistical machine translation","author":"Bojar","year":"2014"},{"key":"10.1016\/j.neunet.2026.108900_bib0025","first-page":"1877","article-title":"Language models are few-shot learners","volume":"33","author":"Brown","year":"2020","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0026","doi-asserted-by":"crossref","unstructured":"Chelba, C., Mikolov, T., Schuster, M., Ge, Q., Brants, T., Koehn, P., & Robinson, T. (2013). One billion word benchmark for measuring progress in statistical language modeling. arXiv: 1312.3005.","DOI":"10.21437\/Interspeech.2014-564"},{"key":"10.1016\/j.neunet.2026.108900_bib0027","unstructured":"Chen, H., Chen, R., Yi, Y., Quan, X., Li, C., Yan, M., & Zhang, J. (2024). Knowledge distillation of black-box large language models. arXiv: 2401.07013."},{"key":"10.1016\/j.neunet.2026.108900_bib0028","series-title":"Proceedings of the 63rd annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"10081","article-title":"EfficientQAT: Efficient quantization-aware training for large language models","author":"Chen","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0029","unstructured":"Chen, M., Tworek, J., Jun, H., Yuan, Q., Oliveira, P. H. P. D., Kaplan, J., Edwards, H., Burda, Y., Joseph, N., Brockman, G. et al. (2021). Evaluating large language models trained on code. arXiv: 2107.03374."},{"key":"10.1016\/j.neunet.2026.108900_bib0030","unstructured":"Chen, S., Wong, S., Chen, L., & Tian, Y. (2023). Extending context window of large language models via positional interpolation. arXiv: 2306.15595."},{"issue":"12","key":"10.1016\/j.neunet.2026.108900_bib0031","doi-asserted-by":"crossref","first-page":"10558","DOI":"10.1109\/TPAMI.2024.3447085","article-title":"A survey on deep neural network pruning: Taxonomy, comparison, analysis, and recommendations","volume":"46","author":"Cheng","year":"2024","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"10.1016\/j.neunet.2026.108900_bib0032","unstructured":"Chhawri, S., Mahadik, R., & Rooj, S. (2025). A systematic study of compression ordering for large language models. arXiv: 2511.19495."},{"key":"10.1016\/j.neunet.2026.108900_bib0033","unstructured":"Choromanski, K., Likhosherstov, V., Dohan, D., Song, X., Gane, A., Sarlos, T., Hawkins, P., Davis, J., Mohiuddin, A., Kaiser, L. et al. (2020). Rethinking attention with performers. arXiv: 2009.14794."},{"issue":"240","key":"10.1016\/j.neunet.2026.108900_bib0034","first-page":"1","article-title":"Palm: Scaling language modeling with pathways","volume":"24","author":"Chowdhery","year":"2023","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.neunet.2026.108900_bib0035","unstructured":"Chua, V. S., Pan, Y., & Jain, N. (2024). Post-training statistical calibration for higher activation sparsity. arXiv: 2412.07174."},{"key":"10.1016\/j.neunet.2026.108900_bib0036","unstructured":"Clark, C., Lee, K., Chang, M.-W., Kwiatkowski, T., Collins, M., & Toutanova, K. (2019). Boolq: Exploring the surprising difficulty of natural yes\/no questions. arXiv: 1905.10044."},{"key":"10.1016\/j.neunet.2026.108900_bib0037","unstructured":"Clark, P., Cowhey, I., Etzioni, O., Khot, T., Sabharwal, A., Schoenick, C., & Tafjord, O. (2018). Think you have solved question answering? Try arc, the ai2 reasoning challenge. arXiv: 1803.05457."},{"key":"10.1016\/j.neunet.2026.108900_bib0038","unstructured":"Clement, C. B., Bierbaum, M., O\u2019Keeffe, K. P., & Alemi, A. A. (2019). On the use of arxiv as a dataset. arXiv: 1905.00075."},{"key":"10.1016\/j.neunet.2026.108900_bib0039","unstructured":"Cobbe, K., Kosaraju, V., Bavarian, M., Chen, M., Jun, H., Kaiser, L., Plappert, M., Tworek, J., Hilton, J., Nakano, R. et al. (2021). Training verifiers to solve math word problems. arXiv: 2110.14168."},{"key":"10.1016\/j.neunet.2026.108900_bib0040","series-title":"Proceedings of the 62nd annual meeting of the association for computational linguistics (volume 1: Long papers)","first-page":"1280","article-title":"DeepSeekMoE: Towards ultimate expert specialization in mixture-of-experts language models","author":"Dai","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0041","series-title":"Proceedings of the 57th annual meeting of the association for computational linguistics","first-page":"2978","article-title":"Transformer-XL: Attentive language models beyond a fixed-length context","author":"Dai","year":"2019"},{"key":"10.1016\/j.neunet.2026.108900_bib0042","series-title":"The twelfth international conference on learning representations","article-title":"Flashattention-2: Faster attention with better parallelism and work partitioning","author":"Dao","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0043","doi-asserted-by":"crossref","first-page":"16344","DOI":"10.52202\/068431-1189","article-title":"Flashattention: Fast and memory-efficient exact attention with io-awareness","volume":"35","author":"Dao","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0044","series-title":"The thirteenth international conference on learning representations","article-title":"Improving language model distillation through hidden state matching","author":"Dasgupta","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0045","unstructured":"Dernoncourt, F., & Lee, J. Y. (2017). Pubmed 200k rct: A dataset for sequential sentence classification in medical abstracts. arXiv: 1710.06071."},{"key":"10.1016\/j.neunet.2026.108900_bib0046","first-page":"30318","article-title":"Gpt3. int8: 8-Bit matrix multiplication for transformers at scale","volume":"35","author":"Dettmers","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0047","series-title":"Proceedings of the 2019 conference of the north american chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers)","first-page":"4171","article-title":"Bert: Pre-training of deep bidirectional transformers for language understanding","author":"Devlin","year":"2019"},{"key":"10.1016\/j.neunet.2026.108900_bib0048","series-title":"Proceedings of the third international workshop on paraphrasing (IWP2005)","first-page":"N\/A","article-title":"Automatically constructing a corpus of sentential paraphrases","author":"Dolan","year":"2005"},{"key":"10.1016\/j.neunet.2026.108900_bib0049","series-title":"The thirteenth international conference on learning representations","article-title":"STBLLM: Breaking the 1-bit barrier with structured binary LLMs","author":"Dong","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0050","series-title":"Acl (1)","first-page":"102","article-title":"Bitdistiller: Unleashing the potential of sub-4-bit LLMs via self-distillation","author":"Du","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0051","series-title":"International conference on machine learning","first-page":"5547","article-title":"Glam: Efficient scaling of language models with mixture-of-experts","author":"Du","year":"2022"},{"key":"10.1016\/j.neunet.2026.108900_bib0052","series-title":"Forty-first international conference on machine learning","article-title":"Extreme compression of large language models via additive quantization","author":"Egiazarian","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0053","doi-asserted-by":"crossref","unstructured":"Fabbri, A. R., Li, I., She, T., Li, S., & Radev, D. R. (2019). Multi-news: A large-scale multi-document summarization dataset and abstractive hierarchical model. arXiv: 1906.01749.","DOI":"10.18653\/v1\/P19-1102"},{"issue":"120","key":"10.1016\/j.neunet.2026.108900_bib0054","first-page":"1","article-title":"Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity","volume":"23","author":"Fedus","year":"2022","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.neunet.2026.108900_bib0055","first-page":"4475","article-title":"Optimal brain compression: A framework for accurate post-training quantization and pruning","volume":"35","author":"Frantar","year":"2022","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0056","series-title":"Proceedings of the 40th international conference on machine learning ICML\u201923","article-title":"SparseGPT: Massive language models can be accurately pruned in one-shot","author":"Frantar","year":"2023"},{"key":"10.1016\/j.neunet.2026.108900_bib0057","unstructured":"Frantar, E., Ashkboos, S., Hoefler, T., & Alistarh, D. (2022). Gptq: Accurate post-training quantization for generative pre-trained transformers. arXiv: 2210.17323."},{"key":"10.1016\/j.neunet.2026.108900_bib0058","unstructured":"Gao, L., Biderman, S., Black, S., Golding, L., Hoppe, T., Foster, C., Phang, J., He, H., Thite, A., Nabeshima, N., Presser, S., & Leahy, C. (2020). The Pile: An 800GB dataset of diverse text for language modeling. arXiv: 2101.00027."},{"key":"10.1016\/j.neunet.2026.108900_bib0059","unstructured":"GLM, T., Zeng, A., Xu, B., Wang, B., Zhang, C., Yin, D., Zhang, D., Rojas, D., Feng, G., Zhao, H. et al. (2024). Chatglm: A family of large language models from glm-130b to glm-4 all tools. arXiv: 2406.12793."},{"key":"10.1016\/j.neunet.2026.108900_bib0060","unstructured":"Golovneva, O., Wang, T., Weston, J., & Sukhbaatar, S. (2024). Contextual position encoding: Learning to count what\u2019s important. arXiv: 2405.18719."},{"key":"10.1016\/j.neunet.2026.108900_bib0061","unstructured":"Golovneva, O., Wang, T., Weston, J., & Sukhbaatar, S. (2025). Multi-token attention. arXiv: 2504.00927."},{"key":"10.1016\/j.neunet.2026.108900_bib0062","series-title":"The twelfth international conference on learning representations","article-title":"MiniLLM: Knowledge distillation of large language models","author":"Gu","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0063","unstructured":"Guo, J., Chen, X., Tang, Y., & Wang, Y. (2025). SlimLLM: Accurate structured pruning for large language models. arXiv: 2505.22689."},{"key":"10.1016\/j.neunet.2026.108900_bib0064","unstructured":"Gupta, A., Guo, H., Yuan, Y., Zhou, Y., & Mendis, C. (2024). Flurka: Fast and accurate unified low-rank & kernel attention. arXiv: 2306.15799."},{"key":"10.1016\/j.neunet.2026.108900_bib0065","unstructured":"Hamman, F., Dissanayake, P., Fu, Y., & Dutta, S. (2025). Few-shot knowledge distillation of LLMs with counterfactual explanations. arXiv: 2510.21631."},{"key":"10.1016\/j.neunet.2026.108900_bib0066","unstructured":"He, P., Liu, X., Gao, J., & Chen, W. (2020). Deberta: Decoding-enhanced bert with disentangled attention. arXiv: 2006.03654."},{"key":"10.1016\/j.neunet.2026.108900_bib0067","unstructured":"Heimersheim, S. (2024). You can remove GPT2\u2019s layernorm by fine-tuning. arXiv: 2409.13710."},{"key":"10.1016\/j.neunet.2026.108900_bib0068","unstructured":"Hendrycks, D., Burns, C., Basart, S., Zou, A., Mazeika, M., Song, D., & Steinhardt, J. (2020). Measuring massive multitask language understanding. arXiv: 2009.03300."},{"key":"10.1016\/j.neunet.2026.108900_bib0069","unstructured":"Hendrycks, D., Burns, C., Kadavath, S., Arora, A., Basart, S., Tang, E., Song, D., & Steinhardt, J. (2021). Measuring mathematical problem solving with the math dataset. arXiv: 2103.03874."},{"key":"10.1016\/j.neunet.2026.108900_bib0070","first-page":"1693","article-title":"Teaching machines to read and comprehend","volume":"28","author":"Hermann","year":"2015","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0071","unstructured":"Ho, N., Schmid, L., & Yun, S.-Y. (2022). Large language models are reasoning teachers. arXiv: 2212.10071."},{"key":"10.1016\/j.neunet.2026.108900_bib0072","unstructured":"Hsieh, C.-P., Sun, S., Kriman, S., Acharya, S., Rekesh, D., Jia, F., Zhang, Y., & Ginsburg, B. (2024). Ruler: What\u2019s the real context size of your long-context language models?arXiv: 2404.06654."},{"key":"10.1016\/j.neunet.2026.108900_bib0073","series-title":"Findings of the association for computational linguistics: ACL 2023","first-page":"8003","article-title":"Distilling step-by-step! outperforming larger language models with less training data and smaller model sizes","author":"Hsieh","year":"2023"},{"key":"10.1016\/j.neunet.2026.108900_bib0074","series-title":"International conference on learning representations","first-page":"N\/A","article-title":"Language model compression with weighted low-rank factorization","author":"Hsu","year":"2022"},{"issue":"2","key":"10.1016\/j.neunet.2026.108900_bib0075","first-page":"3","article-title":"Lora: Low-rank adaptation of large language models","volume":"1","author":"Hu","year":"2022","journal-title":"ICLR"},{"key":"10.1016\/j.neunet.2026.108900_bib0076","unstructured":"Hu, H., Zhao, P., Li, P., Zheng, Y., Wang, Z., & Yuan, X. (2025a). Fasp: Fast and accurate structured pruning of large language models. arXiv: 2501.09412."},{"key":"10.1016\/j.neunet.2026.108900_bib0077","series-title":"The thirteenth international conference on learning representations","article-title":"OSTQuant: Refining large language model quantization with orthogonal and scaling transformations for better distribution fitting","author":"Hu","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0078","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"17494","article-title":"SoLA: Leveraging soft activation sparsity and low-rank decomposition for large language model compression","volume":"vol. 39","author":"Huang","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0079","unstructured":"Huang, Y., Chen, Y., Yu, Z., & McKeown, K. (2022). In-context learning distillation: Transferring few-shot learning ability of pre-trained language models. arXiv: 2212.10670."},{"key":"10.1016\/j.neunet.2026.108900_bib0080","series-title":"International conference on machine learning","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"Ioffe","year":"2015"},{"issue":"3","key":"10.1016\/j.neunet.2026.108900_bib0081","first-page":"3","article-title":"Phi-2: The surprising power of small language models","volume":"1","author":"Javaheripi","year":"2023","journal-title":"Microsoft Research Blog"},{"key":"10.1016\/j.neunet.2026.108900_bib0082","unstructured":"Jiang, A. Q., Sablayrolles, A., Mensch, A., Bamford, C., Chaplot, D. S., de las Casas, D., Bressand, F., Lengyel, G., Lample, G., Saulnier, L., Lavaud, L. R., Lachaux, M.-A., Stock, P., Scao, T. L., Lavril, T., Wang, T., Lacroix, T., & Sayed, W. E. (2023a). Mistral 7b. arXiv: 2310.06825."},{"key":"10.1016\/j.neunet.2026.108900_bib0083","unstructured":"Jiang, A. Q., Sablayrolles, A., Roux, A., Mensch, A., Savary, B., Bamford, C., Chaplot, D. S., de las, C. D., Hanna, E. B., Bressand, F. et al. (2024). Mixtral of experts. arXiv: 2401.04088."},{"key":"10.1016\/j.neunet.2026.108900_bib0084","series-title":"The 2023 conference on empirical methods in natural language processing","article-title":"Lion: Adversarial distillation of proprietary large language models","author":"Jiang","year":"2023"},{"key":"10.1016\/j.neunet.2026.108900_bib0085","doi-asserted-by":"crossref","unstructured":"Joshi, M., Choi, E., Weld, D. S., & Zettlemoyer, L. (2017). Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. arXiv: 1705.03551.","DOI":"10.18653\/v1\/P17-1147"},{"key":"10.1016\/j.neunet.2026.108900_bib0086","unstructured":"Kamradt, G. (2023). Needle in a haystack - pressure testing LLMs. Github. URL https:\/\/github.com\/gkamradt\/LLMTestNeedleInAHaystack\/tree\/main."},{"key":"10.1016\/j.neunet.2026.108900_bib0087","series-title":"International conference on machine learning","first-page":"5156","article-title":"Transformers are rnns: Fast autoregressive transformers with linear attention","author":"Katharopoulos","year":"2020"},{"key":"10.1016\/j.neunet.2026.108900_bib0088","series-title":"Icml 2024 workshop on foundation models in the wild","article-title":"LoRD: Low-rank decomposition of monolingual code LLMs for one-shot compression","author":"Kaushal","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0089","first-page":"42097","article-title":"Token-scaled logit distillation for ternary weight generative language models","volume":"36","author":"Kim","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"issue":"13","key":"10.1016\/j.neunet.2026.108900_bib0090","doi-asserted-by":"crossref","first-page":"3521","DOI":"10.1073\/pnas.1611835114","article-title":"Overcoming catastrophic forgetting in neural networks","volume":"114","author":"Kirkpatrick","year":"2017","journal-title":"Proceedings of the National Academy of Sciences"},{"key":"10.1016\/j.neunet.2026.108900_bib0091","series-title":"Forty-first international conference on machine learning","article-title":"DistiLLM: Towards streamlined distillation for large language models","author":"Ko","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0092","series-title":"2023\u202fIEEE International symposium on circuits and systems (ISCAS)","first-page":"1","article-title":"Hardware-efficient softmax approximation for self-attention networks","author":"Koca","year":"2023"},{"key":"10.1016\/j.neunet.2026.108900_bib0093","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., & Soricut, R. (2019). Albert: A lite bert for self-supervised learning of language representations. arXiv: 1909.11942."},{"key":"10.1016\/j.neunet.2026.108900_bib0094","series-title":"2024 4th international conference on artificial intelligence, robotics, and communication (ICAIRC)","first-page":"224","article-title":"A comprehensive study on quantization techniques for large language models","author":"Lang","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0095","series-title":"International conference on representation learning","first-page":"72494","article-title":"Probe pruning: Accelerating LLMs through dynamic pruning via model-probing","volume":"vol. 2025","author":"Le","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0096","unstructured":"Le Scao, T., Fan, A., Akiki, C., Pavlick, E., Ili\u0107, S., Hesslow, D., Castagn\u00e9, R., Luccioni, A. S., Yvon, F., Gall\u00e9, M. et al. (2023). Bloom: A 176b-parameter open-access multilingual language model. arXiv: 2211.05100."},{"key":"10.1016\/j.neunet.2026.108900_bib0097","unstructured":"Lee, D., Lee, J.-Y., Zhang, G., Tiwari, M., & Mirhoseini, A. (2024). Cats: Contextually-aware thresholding for sparsity in large language models. arXiv: 2404.08763."},{"key":"10.1016\/j.neunet.2026.108900_bib0098","unstructured":"Lefaudeux, B., Massa, F., Liskovich, D., Xiong, W., Caggiano, V., Naren, S., Xu, M., Hu, J., Tintore, M., Zhang, S., Labatut, P., Haziza, D., Wehrstedt, L., Reizenstein, J., & Sizov, G. (2022). xformers: A modular and hackable transformer modelling library. https:\/\/github.com\/facebookresearch\/xformers."},{"key":"10.1016\/j.neunet.2026.108900_bib0099","series-title":"International conference on learning representations","article-title":"{GS}hard: Scaling giant models with conditional computation and automatic sharding","author":"Lepikhin","year":"2021"},{"key":"10.1016\/j.neunet.2026.108900_bib0100","series-title":"International conference on machine learning","first-page":"6265","article-title":"Base layers: Simplifying training of large, sparse models","author":"Lewis","year":"2021"},{"key":"10.1016\/j.neunet.2026.108900_bib0101","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics","first-page":"7871","article-title":"BART: Denoising sequence-to-sequence pre-training for natural language generation, translation, and comprehension","author":"Lewis","year":"2020"},{"key":"10.1016\/j.neunet.2026.108900_bib0102","unstructured":"Li, P., Yin, L., & Liu, S. (2024). Mix-ln: Unleashing the power of deeper layers by combining pre-ln and post-ln. arXiv: 2412.13795."},{"key":"10.1016\/j.neunet.2026.108900_bib0103","series-title":"International conference on machine learning","first-page":"20852","article-title":"Less is more: Task-aware layer-wise distillation for language model compression","author":"Liang","year":"2023"},{"key":"10.1016\/j.neunet.2026.108900_bib0104","unstructured":"Liang, X., Wang, H., Lai, H., Niu, S., Song, S., Yang, J., Zhao, J., Xiong, F., Tang, B., & Li, Z. (2025). Seap: Training-free sparse expert activation pruning unlock the brainpower of large language models. arXiv: 2503.07605."},{"key":"10.1016\/j.neunet.2026.108900_bib0105","series-title":"Text summarization branches out","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","author":"Lin","year":"2004"},{"key":"10.1016\/j.neunet.2026.108900_bib0106","series-title":"Mlsys","article-title":"Awq: Activation-aware weight quantization for on-device llm compression and acceleration","author":"Lin","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0107","unstructured":"Liu, A., Feng, B., Xue, B., Wang, B., Wu, B., Lu, C., Zhao, C., Deng, C., Zhang, C., Ruan, C. et al. (2024a). Deepseek-v3 technical report. arXiv: 2412.19437."},{"key":"10.1016\/j.neunet.2026.108900_bib0108","unstructured":"Liu, J., Gong, R., Wei, X., Dong, Z., Cai, J., & Zhuang, B. (2023). Qllm: Accurate and efficient low-bitwidth quantization for large language models. arXiv: 2310.08041."},{"key":"10.1016\/j.neunet.2026.108900_bib0109","series-title":"International conference on machine learning","first-page":"7021","article-title":"Group fisher pruning for practical network compression","author":"Liu","year":"2021"},{"key":"10.1016\/j.neunet.2026.108900_bib0110","unstructured":"Liu, S., He, B., Wu, H., & Song, L. (2025a). Optishear: Towards efficient and adaptive pruning of large language models via evolutionary optimization. arXiv: 2502.10735."},{"key":"10.1016\/j.neunet.2026.108900_bib0111","series-title":"Proceedings of the 43rd IEEE\/ACM international conference on computer-aided design","first-page":"1","article-title":"Consmax: Hardware-friendly alternative softmax with learnable parameters","author":"Liu","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0112","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., & Stoyanov, V. (2019). Roberta: A robustly optimized bert pretraining approach. arXiv: 1907.11692."},{"key":"10.1016\/j.neunet.2026.108900_bib0113","series-title":"Findings of the association for computational linguistics: ACL 2024","first-page":"467","article-title":"LLM-QAT: Data-free quantization aware training for large language models","author":"Liu","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0114","series-title":"The thirteenth international conference on learning representations","article-title":"Spinquant: LLM quantization with learned rotations","author":"Liu","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0115","unstructured":"Liu, Z., Zhao, C., Iandola, F., Lai, C., Tian, Y., Fedorov, I., Xiong, Y., Chang, E., Shi, Y., Krishnamoorthi, R., Lai, L., & Chandra, V. (2024d). MobileLLM: Optimizing sub-billion parameter language models for on-device use cases. arXiv: 2402.14905."},{"key":"10.1016\/j.neunet.2026.108900_bib0116","unstructured":"Loshchilov, I., Hsieh, C.-P., Sun, S., & Ginsburg, B. (2024). ngpt: Normalized transformer with representation learning on the hypersphere. arXiv: 2410.01131."},{"key":"10.1016\/j.neunet.2026.108900_bib0117","series-title":"Proceedings of the 48th international ACM SIGIR conference on research and development in information retrieval SIGIR \u201925","first-page":"9","article-title":"DiSCo: LLM knowledge distillation for efficient sparse retrieval in conversational search","author":"Lupart","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0118","unstructured":"Ma, S., Wang, H., Ma, L., Wang, L., Wang, W., Huang, S., Dong, L., Wang, R., Xue, J., & Wei, F. (2024). The era of 1-bit llms: All large language models are in 1.58 bits. arXiv: 2402.17764, 1."},{"key":"10.1016\/j.neunet.2026.108900_bib0119","first-page":"21702","article-title":"Llm-pruner: On the structural pruning of large language models","volume":"36","author":"Ma","year":"2023","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0120","first-page":"2441","article-title":"Luna: Linear unified nested attention","volume":"34","author":"Ma","year":"2021","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0121","unstructured":"Mahoney, M. (2011). Large text compression benchmark."},{"issue":"2","key":"10.1016\/j.neunet.2026.108900_bib0122","first-page":"313","article-title":"Building a large annotated corpus of english: The penn treebank","volume":"19","author":"Marcus","year":"1993","journal-title":"Computational Linguistics"},{"key":"10.1016\/j.neunet.2026.108900_bib0123","unstructured":"Merity, S., Xiong, C., Bradbury, J., & Socher, R. (2016a). Pointer sentinel mixture models. arXiv: 1609.07843."},{"key":"10.1016\/j.neunet.2026.108900_bib0124","unstructured":"Merity, S., Xiong, C., Bradbury, J., & Socher, R. (2016b). Pointer sentinel mixture models. arXiv: 1609.07843."},{"key":"10.1016\/j.neunet.2026.108900_bib0125","unstructured":"Milakov, M., & Gimelshein, N. (2018). Online normalizer calculation for softmax. arXiv: 1805.02867."},{"key":"10.1016\/j.neunet.2026.108900_bib0126","series-title":"Proceedings of the 16th international conference on spoken language translation","article-title":"Transformers without tears: Improving the normalization of self-attention","author":"Nguyen","year":"2019"},{"key":"10.1016\/j.neunet.2026.108900_bib0127","series-title":"The twelfth international conference on learning representations","article-title":"Skeleton-of-thought: Prompting LLMs for efficient parallel generation","author":"Ning","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0128","unstructured":"Open-Orca Contributors (2023). Openorca: Open-source instruction-following dataset. https:\/\/huggingface.co\/datasets\/Open-Orca\/OpenOrca. Hugging Face Dataset card; arXiv works associated include arXiv: 2306.02707, arXiv: 2301.13688, arXiv: 2302.13971."},{"key":"10.1016\/j.neunet.2026.108900_bib0129","unstructured":"Pagliardini, M., Paliotta, D., Jaggi, M., & Fleuret, F. (2023). Faster causal attention over large sequences through sparse flash attention. arXiv: 2306.01160."},{"key":"10.1016\/j.neunet.2026.108900_bib0130","series-title":"Proceedings of the 30th ACM SIGKDD conference on knowledge discovery and data mining","first-page":"6605","article-title":"Inference optimization of foundation models on ai accelerators","author":"Park","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0131","series-title":"The twelfth international conference on learning representations","article-title":"YaRN: Efficient context window extension of large language models","author":"Peng","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0132","unstructured":"Peng, H., Pappas, N., Yogatama, D., Schwartz, R., Smith, N. A., & Kong, L. (2021). Random feature attention. arXiv: 2103.02143."},{"key":"10.1016\/j.neunet.2026.108900_bib0133","series-title":"Proceedings of the tenth workshop on statistical machine translation","first-page":"392","article-title":"Chrf: Character n-gram f-score for automatic MT evaluation","author":"Popovi\u0107","year":"2015"},{"key":"10.1016\/j.neunet.2026.108900_bib0134","unstructured":"Rae, J. W., Potapenko, A., Jayakumar, S. M., & Lillicrap, T. P. (2019). Compressive transformers for long-range sequence modelling. arXiv: 1911.05507."},{"issue":"140","key":"10.1016\/j.neunet.2026.108900_bib0135","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of Machine Learning Research"},{"issue":"140","key":"10.1016\/j.neunet.2026.108900_bib0136","first-page":"1","article-title":"Exploring the limits of transfer learning with a unified text-to-text transformer","volume":"21","author":"Raffel","year":"2020","journal-title":"Journal of Machine Learning Research"},{"key":"10.1016\/j.neunet.2026.108900_bib0137","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., & Liang, P. (2016). Squad: 100,000+ Questions for machine comprehension of text. arXiv: 1606.05250.","DOI":"10.18653\/v1\/D16-1264"},{"key":"10.1016\/j.neunet.2026.108900_bib0138","doi-asserted-by":"crossref","unstructured":"Reid, M., Marrese-Taylor, E., & Matsuo, Y. (2021). Subformer: Exploring weight sharing for parameter efficiency in generative transformers. arXiv: 2101.00234.","DOI":"10.18653\/v1\/2021.findings-emnlp.344"},{"issue":"9","key":"10.1016\/j.neunet.2026.108900_bib0139","doi-asserted-by":"crossref","first-page":"99","DOI":"10.1145\/3474381","article-title":"Winogrande: An adversarial winograd schema challenge at scale","volume":"64","author":"Sakaguchi","year":"2021","journal-title":"Communications of the ACM"},{"key":"10.1016\/j.neunet.2026.108900_bib0140","series-title":"Proceedings of the 2019 conference on empirical methods in natural language processing and the 9th international joint conference on natural language processing (EMNLP-IJCNLP)","first-page":"4463","article-title":"Social IQa: Commonsense reasoning about social interactions","author":"Sap","year":"2019"},{"key":"10.1016\/j.neunet.2026.108900_bib0141","unstructured":"Sharma, L., Graesser, L., Nangia, N., & Evci, U. (2019). Natural language understanding with the quora question pairs dataset. arXiv: 1907.01041."},{"key":"10.1016\/j.neunet.2026.108900_bib0142","series-title":"Proceedings of the 2018 conference of the north american chapter of the association for computational linguistics: Human language technologies, volume 2 (short papers)","first-page":"464","article-title":"Self-attention with relative position representations","author":"Shaw","year":"2018"},{"key":"10.1016\/j.neunet.2026.108900_bib0143","unstructured":"Shazeer, N. (1911). Fast transformer decoding: One write-head is all you need, 2019. https:\/\/arxiv.org\/abs."},{"key":"10.1016\/j.neunet.2026.108900_bib0144","unstructured":"Shazeer, N. (2020). Glu variants improve transformer. arXiv: 2002.05202."},{"key":"10.1016\/j.neunet.2026.108900_bib0145","series-title":"International conference on learning representations","article-title":"Outrageously large neural networks: The sparsely-gated mixture-of-experts layer","author":"Shazeer","year":"2017"},{"issue":"1\u20132","key":"10.1016\/j.neunet.2026.108900_bib0146","doi-asserted-by":"crossref","first-page":"1","DOI":"10.1561\/2200000068","article-title":"Introduction to multi-armed bandits","volume":"12","author":"Slivkins","year":"2019","journal-title":"Foundations and Trends\u00ae in Machine Learning"},{"key":"10.1016\/j.neunet.2026.108900_bib0147","series-title":"Proceedings of the 31st international conference on computational linguistics","first-page":"2626","article-title":"ProSparse: Introducing and enhancing intrinsic activation sparsity within large language models","author":"Song","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0148","series-title":"2021 58th ACM\/IEEE design automation conference (DAC)","first-page":"469","article-title":"Softermax: Hardware\/software co-design of an efficient softmax for transformers","author":"Stevens","year":"2021"},{"key":"10.1016\/j.neunet.2026.108900_bib0149","doi-asserted-by":"crossref","DOI":"10.1016\/j.neucom.2023.127063","article-title":"Roformer: Enhanced transformer with rotary position embedding","volume":"568","author":"Su","year":"2024","journal-title":"Neurocomputing"},{"key":"10.1016\/j.neunet.2026.108900_bib0150","series-title":"The twelfth international conference on learning representations","article-title":"A simple and effective pruning approach for large language models","author":"Sun","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0151","unstructured":"Sun, M., Liu, Z., Bair, A., & Kolter, J. Z. (2024b). A simple and effective pruning approach for large language models. arXiv: 2306.11695."},{"key":"10.1016\/j.neunet.2026.108900_bib0152","series-title":"Proceedings of the 58th annual meeting of the association for computational linguistics","first-page":"2158","article-title":"MobileBERT: A compact task-agnostic BERT for resource-limited devices","author":"Sun","year":"2020"},{"key":"10.1016\/j.neunet.2026.108900_bib0153","unstructured":"Talmor, A., Herzig, J., Lourie, N., & Berant, J. (2018). Commonsenseqa: A question answering challenge targeting commonsense knowledge. arXiv: 1811.00937."},{"key":"10.1016\/j.neunet.2026.108900_bib0154","unstructured":"Tay, Y., Dehghani, M., Abnar, S., Shen, Y., Bahri, D., Pham, P., Rao, J., Yang, L., Ruder, S., & Metzler, D. (2020). Long range arena: A benchmark for efficient transformers. arXiv: 2011.04006."},{"issue":"6","key":"10.1016\/j.neunet.2026.108900_bib0155","doi-asserted-by":"crossref","DOI":"10.1145\/3530811","article-title":"Efficient transformers: A survey","volume":"55","author":"Tay","year":"2022","journal-title":"ACM Computing Surveys"},{"key":"10.1016\/j.neunet.2026.108900_bib0156","unstructured":"Thawakar, O., Vayani, A., Khan, S., Cholakal, H., Anwer, R. M., Felsberg, M., Baldwin, T., Xing, E. P., & Khan, F. S. (2024). Mobillama: Towards accurate and lightweight fully transparent gpt. arXiv: 2402.16840."},{"key":"10.1016\/j.neunet.2026.108900_bib0157","series-title":"Proceedings of the babyLM challenge at the 27th conference on computational natural language learning","first-page":"279","article-title":"Baby llama: Knowledge distillation from an ensemble of teachers trained on a small dataset with no performance penalty","author":"Timiryasov","year":"2023"},{"key":"10.1016\/j.neunet.2026.108900_bib0158","unstructured":"Touvron, H., Lavril, T., Izacard, G., Martinet, X., Lachaux, M.-A., Lacroix, T., Rozi\u00e8re, B., Goyal, N., Hambro, E., Azhar, F. et al. (2023a). Llama: Open and efficient foundation language models. arXiv: 2302.13971."},{"key":"10.1016\/j.neunet.2026.108900_bib0159","unstructured":"Touvron, H., Martin, L., Stone, K., Albert, P., Almahairi, A., Babaei, Y., Bashlykov, N., Batra, S., Bhargava, P., Bhosale, S. et al. (2023b). Llama 2: Open foundation and fine-tuned chat models. arXiv: 2307.09288."},{"key":"10.1016\/j.neunet.2026.108900_bib0160","series-title":"Forty-first international conference on machine learning","article-title":"QuIP#: Even better LLM quantization with hadamard incoherence and lattice codebooks","author":"Tseng","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0161","first-page":"6000","article-title":"Attention is all you need","volume":"30","author":"Vaswani","year":"2017","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0162","unstructured":"Vicuna (2023). Vicuna: An open-source chatbot impressing GPT-4 with 90%* ChatGPT quality. https:\/\/lmsys.org\/blog\/2023-03-30-vicuna\/."},{"key":"10.1016\/j.neunet.2026.108900_bib0163","series-title":"Proceedings of the 57th annual meeting of the association for computational linguistics","first-page":"5797","article-title":"Analyzing multi-head self-attention: Specialized heads do the heavy lifting, the rest can be pruned","author":"Voita","year":"2019"},{"key":"10.1016\/j.neunet.2026.108900_bib0164","article-title":"Efficient large language models: A survey","author":"Wan","year":"2024","journal-title":"Transactions on Machine Learning Research"},{"key":"10.1016\/j.neunet.2026.108900_bib0165","series-title":"Proceedings of the 2018\u202fEMNLP workshop blackboxNLP: analyzing and interpreting neural networks for NLP","first-page":"353","article-title":"GLUE: A multi-task benchmark and analysis platform for natural language understanding","author":"Wang","year":"2018"},{"key":"10.1016\/j.neunet.2026.108900_bib0166","unstructured":"Wang, H., Ma, S., Dong, L., Huang, S., Wang, H., Ma, L., Yang, F., Wang, R., Wu, Y., & Wei, F. (2023). Bitnet: Scaling 1-bit transformers for large language models. arXiv: 2310.11453."},{"key":"10.1016\/j.neunet.2026.108900_bib0167","unstructured":"Wang, J., Chen, Y.-G., Lin, I.-C., Li, B., & Zhang, G. L. (2024). Basis sharing: Cross-layer parameter sharing for large language model compression. arXiv: 2410.03765."},{"key":"10.1016\/j.neunet.2026.108900_bib0168","unstructured":"Wang, S., Li, B. Z., Khabsa, M., Fang, H., & Ma, H. (2020). Linformer: Self-attention with linear complexity. arXiv: 2006.04768."},{"key":"10.1016\/j.neunet.2026.108900_bib0169","series-title":"Proceedings of the 2025 conference of the nations of the americas chapter of the association for computational linguistics: Human language technologies (volume 1: Long papers)","article-title":"SVD-LLM v2: Optimizing singular value truncation for large language model compression","author":"Wang","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0170","series-title":"The thirteenth international conference on learning representations","article-title":"SVD-LLM: Truncation-aware singular value decomposition for large language model compression","author":"Wang","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0171","series-title":"Proceedings of the 2023 conference on empirical methods in natural language processing","first-page":"1648","article-title":"Outlier suppression+: Accurate quantization of large language models by equivalent and effective shifting and scaling","author":"Wei","year":"2023"},{"key":"10.1016\/j.neunet.2026.108900_bib0172","doi-asserted-by":"crossref","first-page":"287","DOI":"10.1162\/tacl_a_00021","article-title":"Constructing datasets for multi-hop reading comprehension across documents","volume":"6","author":"Welbl","year":"2018","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"10.1016\/j.neunet.2026.108900_bib0173","series-title":"International conference on machine learning","first-page":"38087","article-title":"Smoothquant: Accurate and efficient post-training quantization for large language models","author":"Xiao","year":"2023"},{"key":"10.1016\/j.neunet.2026.108900_bib0174","series-title":"International conference on machine learning","first-page":"10524","article-title":"On layer normalization in the transformer architecture","author":"Xiong","year":"2020"},{"key":"10.1016\/j.neunet.2026.108900_bib0175","series-title":"Proceedings of the AAAI conference on artificial intelligence","first-page":"14138","article-title":"Nystr\u00f6mformer: A nystr\u00f6m-based algorithm for approximating self-attention","volume":"vol. 35","author":"Xiong","year":"2021"},{"key":"10.1016\/j.neunet.2026.108900_bib0176","first-page":"4381","article-title":"Understanding and improving layer normalization","volume":"32","author":"Xu","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0177","series-title":"The thirteenth international conference on learning representations","article-title":"Speculative knowledge distillation: Bridging the teacher-student gap through interleaved sampling","author":"Xu","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0178","unstructured":"Xu, X., Li, M., Tao, C., Shen, T., Cheng, R., Li, J., Xu, C., Tao, D., & Zhou, T. (2024a). A survey on knowledge distillation of large language models. CoRR, arXiv: 2402.13116."},{"key":"10.1016\/j.neunet.2026.108900_bib0179","series-title":"Advances in neural information processing systems","first-page":"66357","article-title":"Onebit: Towards extremely low-bit large language models","volume":"37","author":"Xu","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0180","series-title":"Proceedings of the 2021 conference of the north american chapter of the association for computational linguistics: Human language technologies","first-page":"483","article-title":"mT5: A massively multilingual pre-trained text-to-text transformer","author":"Xue","year":"2021"},{"key":"10.1016\/j.neunet.2026.108900_bib0181","doi-asserted-by":"crossref","unstructured":"Yao, J., & Gultepe, E. (2026). Spq: An ensemble technique for large language model compression. arXiv: 2602.18420.","DOI":"10.63317\/3t98siww4xf2"},{"key":"10.1016\/j.neunet.2026.108900_bib0182","unstructured":"Yuan, Z., Shang, Y., Song, Y., Wu, Q., Yan, Y., & Sun, G. (2023). Asvd: Activation-aware singular value decomposition for compressing large language models. arXiv: 2312.05821."},{"key":"10.1016\/j.neunet.2026.108900_bib0183","doi-asserted-by":"crossref","unstructured":"Zellers, R., Holtzman, A., Bisk, Y., Farhadi, A., & Choi, Y. (2019). Hellaswag: Can a machine really finish your sentence?arXiv: 1905.07830.","DOI":"10.18653\/v1\/P19-1472"},{"key":"10.1016\/j.neunet.2026.108900_bib0184","first-page":"12360","article-title":"Root mean square layer normalization","volume":"32","author":"Zhang","year":"2019","journal-title":"Advances in Neural Information Processing Systems"},{"key":"10.1016\/j.neunet.2026.108900_bib0185","unstructured":"Zhang, J., Wang, H., Jiang, K., Yang, S., Zheng, K., Xi, H., Wang, Z., Zhu, H., Zhao, M., Stoica, I. et al. (2025a). Sla: Beyond sparsity in diffusion transformers via fine-tunable sparse-linear attention. arXiv: 2509.24006."},{"key":"10.1016\/j.neunet.2026.108900_bib0186","series-title":"The thirteenth international conference on learning representations","article-title":"LoLCATs: On low-rank linearizing of large language models","author":"Zhang","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0187","unstructured":"Zhang, S., Roller, S., Goyal, N., Artetxe, M., Chen, M., Chen, S., Dewan, C., Diab, M., Li, X., Lin, X. V. et al. (2022a). Opt: Open pre-trained transformer language models. arXiv: 2205.01068."},{"key":"10.1016\/j.neunet.2026.108900_bib0188","series-title":"Proceedings of the 2024 conference on empirical methods in natural language processing","first-page":"18164","article-title":"Dual-space knowledge distillation for large language models","author":"Zhang","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0189","series-title":"The thirteenth international conference on learning representations","article-title":"Leanquant: Accurate and scalable large language model quantization with loss-error-aware grid","author":"Zhang","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0190","series-title":"Findings of the association for computational linguistics: ACL 2022","first-page":"877","article-title":"MoEfication: Transformer feed-forward layers are mixtures of experts","author":"Zhang","year":"2022"},{"key":"10.1016\/j.neunet.2026.108900_bib0191","unstructured":"Zhang, Z., Liu, Z., Tian, Y., Khaitan, H., Wang, Z., & Li, S. (2025c). R-sparse: Rank-aware activation sparsity for efficient llm inference. arXiv: 2504.19449."},{"key":"10.1016\/j.neunet.2026.108900_bib0192","unstructured":"Zhou, Z., Ning, X., Hong, K., Fu, T., Xu, J., Li, S., Lou, Y., Wang, L., Yuan, Z., Li, X., Yan, S., Dai, G., Zhang, X.-P., Dong, Y., & Wang, Y. (2024). A survey on efficient inference for large language models. CoRR, arXiv: 2404.14294."},{"key":"10.1016\/j.neunet.2026.108900_bib0193","series-title":"First conference on language modeling","article-title":"Starling-7b: Improving helpfulness and harmlessness with RLAIF","author":"Zhu","year":"2024"},{"key":"10.1016\/j.neunet.2026.108900_bib0194","doi-asserted-by":"crossref","unstructured":"Zhu, C., Liu, Y., Mei, J., & Zeng, M. (2021). Mediasum: A large-scale media interview dataset for dialogue summarization. arXiv: 2103.06410.","DOI":"10.18653\/v1\/2021.naacl-main.474"},{"key":"10.1016\/j.neunet.2026.108900_bib0195","series-title":"Proceedings of the computer vision and pattern recognition conference","first-page":"14901","article-title":"Transformers without normalization","author":"Zhu","year":"2025"},{"key":"10.1016\/j.neunet.2026.108900_bib0196","first-page":"8958","article-title":"Multimodal c4: An open, billion-scale corpus of images interleaved with text","volume":"36","author":"Zhu","year":"2023","journal-title":"Advances in Neural Information Processing Systems"}],"container-title":["Neural Networks"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026003618?httpAccept=text\/xml","content-type":"text\/xml","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/api.elsevier.com\/content\/article\/PII:S0893608026003618?httpAccept=text\/plain","content-type":"text\/plain","content-version":"vor","intended-application":"text-mining"}],"deposited":{"date-parts":[[2026,6,11]],"date-time":"2026-06-11T03:12:42Z","timestamp":1781147562000},"score":1,"resource":{"primary":{"URL":"https:\/\/linkinghub.elsevier.com\/retrieve\/pii\/S0893608026003618"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,9]]},"references-count":196,"alternative-id":["S0893608026003618"],"URL":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108900","relation":{},"ISSN":["0893-6080"],"issn-type":[{"value":"0893-6080","type":"print"}],"subject":[],"published":{"date-parts":[[2026,9]]},"assertion":[{"value":"Elsevier","name":"publisher","label":"This article is maintained by"},{"value":"Towards efficient language giants: A comprehensive survey on structural optimizations and compression techniques for large language models","name":"articletitle","label":"Article Title"},{"value":"Neural Networks","name":"journaltitle","label":"Journal Title"},{"value":"https:\/\/doi.org\/10.1016\/j.neunet.2026.108900","name":"articlelink","label":"CrossRef DOI link to publisher maintained version"},{"value":"article","name":"content_type","label":"Content Type"},{"value":"\u00a9 2026 Elsevier Ltd. All rights are reserved, including those for text and data mining, AI training, and similar technologies.","name":"copyright","label":"Copyright"}],"article-number":"108900"}}