{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,12,31]],"date-time":"2025-12-31T12:06:21Z","timestamp":1767182781487,"version":"3.44.0"},"reference-count":71,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2025,5,14]],"date-time":"2025-05-14T00:00:00Z","timestamp":1747180800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,5,14]],"date-time":"2025-05-14T00:00:00Z","timestamp":1747180800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"name":"Pioneer R&D Program of Zhejiang","award":["No.2024C01021"],"award-info":[{"award-number":["No.2024C01021"]}]},{"name":"Major Research Program of Zhejiang Provincial Natural Science Foundation","award":["No.LD24F020015"],"award-info":[{"award-number":["No.LD24F020015"]}]},{"name":"OPPO Research Fund"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["The VLDB Journal"],"published-print":{"date-parts":[[2025,7]]},"DOI":"10.1007\/s00778-025-00919-7","type":"journal-article","created":{"date-parts":[[2025,5,14]],"date-time":"2025-05-14T01:38:36Z","timestamp":1747186716000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["HMI: hierarchical knowledge management for efficient multi-tenant inference in pretrained language models"],"prefix":"10.1007","volume":"34","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-5245-7956","authenticated-orcid":false,"given":"Jun","family":"Zhang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jue","family":"Wang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Huan","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Lidan","family":"Shou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ke","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gang","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Qin","family":"Xie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Guiming","family":"Xie","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Xuejian","family":"Gong","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2025,5,14]]},"reference":[{"key":"919_CR1","doi-asserted-by":"crossref","unstructured":"Baek, E., Kwon, D., Kim, J.: A multi-neural network acceleration architecture. In: 2020 ACM\/IEEE 47th Annual International Symposium on Computer Architecture (ISCA), pp. 940\u2013953. IEEE (2020)","DOI":"10.1109\/ISCA45697.2020.00081"},{"key":"919_CR2","doi-asserted-by":"publisher","unstructured":"Bai, H., Zhang, W., Hou, L., Shang, L., Jin, J., Jiang, X., Liu, Q., Lyu, M., King, I.: BinaryBERT: Pushing the limit of BERT quantization. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 4334\u20134348. Association for Computational Linguistics, Online (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.334","DOI":"10.18653\/v1\/2021.acl-long.334"},{"key":"919_CR3","unstructured":"Bai, J., Bai, S., Chu, Y., et al.: Qwen technical report. arXiv preprint arXiv:2309.16609 (2023)"},{"key":"919_CR4","doi-asserted-by":"publisher","unstructured":"Beltagy, I., Lo, K., Cohan, A.: SciBERT: A pretrained language model for scientific text. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 3615\u20133620. Association for Computational Linguistics, Hong Kong, China (2019). https:\/\/doi.org\/10.18653\/v1\/D19-1371","DOI":"10.18653\/v1\/D19-1371"},{"key":"919_CR5","doi-asserted-by":"crossref","unstructured":"Bergsma, S., Zeyl, T., Senderovich, A., Beck, J.C.: Generating complex, realistic cloud workloads using recurrent neural networks. In: Proceedings of the ACM SIGOPS 28th Symposium on Operating Systems Principles, pp. 376\u2013391 (2021)","DOI":"10.1145\/3477132.3483590"},{"key":"919_CR6","unstructured":"Bommasani, R., Hudson, D.A., Adeli, E., Altman, R., Arora, S., von Arx, S., Bernstein, M.S., Bohg, J., Bosselut, A., Brunskill, E., et\u00a0al.: On the opportunities and risks of foundation models. ArXiv preprint arXiv:abs\/2108.07258 (2021)"},{"key":"919_CR7","unstructured":"Brown, T.B., Mann, B., Ryder, N., Subbiah, M., Kaplan, J., Dhariwal, P., Neelakantan, A., Shyam, P., Sastry, G., Askell, A., Agarwal, S., Herbert-Voss, A., Krueger, G., Henighan, T., Child, R., Ramesh, A., Ziegler, D.M., Wu, J., Winter, C., Hesse, C., Chen, M., Sigler, E., Litwin, M., Gray, S., Chess, B., Clark, J., Berner, C., McCandlish, S., Radford, A., Sutskever, I., Amodei, D.: Language models are few-shot learners. In: H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.\u00a0Balcan, H.\u00a0Lin (eds.) Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/1457c0d6bfcb 4967418bfb8ac142f64a-Abstract.html"},{"key":"919_CR8","unstructured":"Chandra, A., Stefanus, R.: Experiments on paraphrase identification using quora question pairs dataset (2020). arXiv:2006.02648"},{"key":"919_CR9","doi-asserted-by":"crossref","unstructured":"Chard, R., Li, Z., Chard, K., Ward, L., Babuji, Y., Woodard, A., Tuecke, S., Blaiszik, B., Franklin, M.J., Foster, I.: Dlhub: Model and data serving for science. In: 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS), pp. 283\u2013292. IEEE (2019)","DOI":"10.1109\/IPDPS.2019.00038"},{"key":"919_CR10","unstructured":"Chen, L., Ye, Z., Wu, Y., Zhuo, D., Ceze, L., Krishnamurthy, A.: Punica: Multi-tenant lora serving. arXiv preprint arXiv:2310.18547 (2023)"},{"key":"919_CR11","doi-asserted-by":"publisher","unstructured":"Chen, Z., Sadhukhan, R., Ye, Z., Zhou, Y., Zhang, J., Nolte, N., Tian, Y., Douze, M., Bottou, L., Jia, Z., Chen, B.: Magicpig: LSH sampling for efficient LLM generation. CoRR arXiv:abs\/2410.16179 (2024). https:\/\/doi.org\/10.48550\/ARXIV.2410.16179","DOI":"10.48550\/ARXIV.2410.16179"},{"key":"919_CR12","unstructured":"Cobbe, K., Kosaraju, V., Bavarian, M., Chen, M., Jun, H., Kaiser, L., Plappert, M., Tworek, J., Hilton, J., Nakano, R., Hesse, C., Schulman, J.: Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168 (2021)"},{"key":"919_CR13","unstructured":"Crankshaw, D., Wang, X., Zhou, G., Franklin, M.J., Gonzalez, J.E., Stoica, I.: Clipper: A $$\\{$$Low-Latency$$\\}$$ online prediction serving system. In: 14th USENIX Symposium on Networked Systems Design and Implementation (NSDI 17), pp. 613\u2013627 (2017)"},{"key":"919_CR14","doi-asserted-by":"publisher","unstructured":"Devlin, J., Chang, M.W., Lee, K., Toutanova, K.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers), pp. 4171\u20134186. Association for Computational Linguistics, Minneapolis, Minnesota (2019). https:\/\/doi.org\/10.18653\/v1\/N19-1423.https:\/\/aclanthology.org\/N19-1423","DOI":"10.18653\/v1\/N19-1423."},{"key":"919_CR15","unstructured":"Gao, L., Biderman, S., Black, S., Golding, L., Hoppe, T., Foster, C., Phang, J., He, H., Thite, A., Nabeshima, N., Presser, S., Leahy, C.: The pile: An 800gb dataset of diverse text for language modeling (2020). arxiv:2101.00027"},{"key":"919_CR16","unstructured":"Han, S., Mao, H., Dally, W.J.: Deep compression: Compressing deep neural networks with pruning, trained quantization and huffman coding. In: International Conference on Learning Representations (2016)"},{"key":"919_CR17","unstructured":"He, J., Zhou, C., Ma, X., Berg-Kirkpatrick, T., Neubig, G.: Towards a unified view of parameter-efficient transfer learning. In: International Conference on Learning Representations (2021)"},{"key":"919_CR18","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. ArXiv preprint arxiv:abs\/1503.02531 (2015)"},{"key":"919_CR19","unstructured":"Houlsby, N., Giurgiu, A., Jastrzebski, S., Morrone, B., de\u00a0Laroussilhe, Q., Gesmundo, A., Attariyan, M., Gelly, S.: Parameter-efficient transfer learning for NLP. In: K.\u00a0Chaudhuri, R.\u00a0Salakhutdinov (eds.) Proceedings of the 36th International Conference on Machine Learning, ICML 2019, 9-15 June 2019, Long Beach, California, USA, Proceedings of Machine Learning Research, vol.\u00a097, pp. 2790\u20132799. PMLR (2019). http:\/\/proceedings.mlr.press\/v97\/houlsby19a.html"},{"key":"919_CR20","doi-asserted-by":"publisher","unstructured":"Howard, J., Ruder, S.: Universal language model fine-tuning for text classification. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 328\u2013339. Association for Computational Linguistics, Melbourne, Australia (2018). https:\/\/doi.org\/10.18653\/v1\/P18-1031 https:\/\/aclanthology.org\/P18-1031","DOI":"10.18653\/v1\/P18-1031"},{"key":"919_CR21","unstructured":"Hu, E.J., Wallis, P., Allen-Zhu, Z., Li, Y., Wang, S., Wang, L., Chen, W., et\u00a0al.: Lora: Low-rank adaptation of large language models. In: International Conference on Learning Representations (2021)"},{"key":"919_CR22","unstructured":"Huang, K., Altosaar, J., Ranganath, R.: Clinicalbert: Modeling clinical notes and predicting hospital readmission. ArXiv preprint arxiv:abs\/1904.05342 (2019). https:\/\/arxiv.org\/abs\/1904.05342"},{"key":"919_CR23","doi-asserted-by":"crossref","unstructured":"Iandola, F., Shaw, A., Krishna, R., Keutzer, K.: SqueezeBERT: What can computer vision teach NLP about efficient neural networks? In: Proceedings of SustaiNLP: Workshop on Simple and Efficient Natural Language Processing, pp. 124\u2013135. Association for Computational Linguistics, Online (2020). doi:10.18653\/v1\/2020.sustainlp-1.17. https:\/\/aclanthology.org\/2020.sustainlp-1.17","DOI":"10.18653\/v1\/2020.sustainlp-1.17"},{"key":"919_CR24","doi-asserted-by":"publisher","unstructured":"Jawahar, G., Sagot, B., Seddah, D.: What does BERT learn about the structure of language? In: Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pp. 3651\u20133657. Association for Computational Linguistics, Florence, Italy (2019). https:\/\/doi.org\/10.18653\/v1\/P19-1356 https:\/\/aclanthology.org\/P19-1356","DOI":"10.18653\/v1\/P19-1356"},{"key":"919_CR25","doi-asserted-by":"publisher","unstructured":"Jiao, X., Yin, Y., Shang, L., Jiang, X., Chen, X., Li, L., Wang, F., Liu, Q.: TinyBERT: Distilling BERT for natural language understanding. In: Findings of the Association for Computational Linguistics: EMNLP 2020, pp. 4163\u20134174. Association for Computational Linguistics, Online (2020). https:\/\/doi.org\/10.18653\/v1\/2020.findings-emnlp.372. https:\/\/aclanthology.org\/2020.findings-emnlp.372","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"919_CR26","doi-asserted-by":"publisher","unstructured":"Jin, X., Zhang, D., Zhu, H., Xiao, W., Li, S.W., Wei, X., Arnold, A., Ren, X.: Lifelong pretraining: Continually adapting language models to emerging corpora. In: Proceedings of BigScience Episode #5 \u2013 Workshop on Challenges & Perspectives in Creating Large Language Models, pp. 1\u201316. Association for Computational Linguistics, virtual+Dublin (2022). https:\/\/doi.org\/10.18653\/v1\/2022.bigscience-1.1. https:\/\/aclanthology.org\/2022.bigscience-1.1","DOI":"10.18653\/v1\/2022.bigscience-1.1"},{"key":"919_CR27","unstructured":"Kim, S., Gholami, A., Yao, Z., Mahoney, M.W., Keutzer, K.: I-BERT: integer-only BERT quantization. In: M.\u00a0Meila, T.\u00a0Zhang (eds.) Proceedings of the 38th International Conference on Machine Learning, ICML 2021, 18-24 July 2021, Virtual Event, Proceedings of Machine Learning Research, vol. 139, pp. 5506\u20135518. PMLR (2021). http:\/\/proceedings.mlr.press\/v139\/kim21d.html"},{"key":"919_CR28","doi-asserted-by":"publisher","unstructured":"Kumar, A., Boehm, M., Yang, J.: Data management in machine learning: Challenges, techniques, and systems. In: S.\u00a0Salihoglu, W.\u00a0Zhou, R.\u00a0Chirkova, J.\u00a0Yang, D.\u00a0Suciu (eds.) Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA, May 14-19, 2017, pp. 1717\u20131722. ACM (2017). https:\/\/doi.org\/10.1145\/3035918.3054775","DOI":"10.1145\/3035918.3054775"},{"key":"919_CR29","unstructured":"Lan, Z., Chen, M., Goodman, S., Gimpel, K., Sharma, P., Soricut, R.: ALBERT: A lite BERT for self-supervised learning of language representations. In: 8th International Conference on Learning Representations, ICLR 2020, Addis Ababa, Ethiopia, April 26-30, 2020. OpenReview.net (2020). https:\/\/openreview.net\/forum?id=H1eA7AEtvS"},{"issue":"4","key":"919_CR30","doi-asserted-by":"publisher","first-page":"1234","DOI":"10.1093\/bioinformatics\/btz682","volume":"36","author":"J Lee","year":"2020","unstructured":"Lee, J., Yoon, W., Kim, S., Kim, D., Kim, S., So, C.H., Kang, J.: Biobert: a pre-trained biomedical language representation model for biomedical text mining. Bioinformatics 36(4), 1234\u20131240 (2020)","journal-title":"Bioinformatics"},{"key":"919_CR31","unstructured":"Lee, Y., Scolari, A., Chun, B.G., Santambrogio, M.D., Weimer, M., Interlandi, M.: $$\\{$$PRETZEL$$\\}$$: Opening the black box of machine learning prediction serving systems. In: 13th USENIX Symposium on Operating Systems Design and Implementation (OSDI 18), pp. 611\u2013626 (2018)"},{"key":"919_CR32","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R., Constant, N.: The power of scale for parameter-efficient prompt tuning. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 3045\u20133059. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic (2021). doi:10.18653\/v1\/2021.emnlp-main.243. https:\/\/aclanthology.org\/2021.emnlp-main.243","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"919_CR33","doi-asserted-by":"crossref","unstructured":"Li, J., Liu, X., Zhao, H., Xu, R., Yang, M., Jin, Y.: BERT-EMD: Many-to-many layer mapping for BERT compression with earth mover\u2019s distance. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pp. 3009\u20133018. Association for Computational Linguistics, Online (2020). doi:10.18653\/v1\/2020.emnlp-main.242. https:\/\/aclanthology.org\/2020.emnlp-main.242","DOI":"10.18653\/v1\/2020.emnlp-main.242"},{"key":"919_CR34","doi-asserted-by":"crossref","unstructured":"Li, L., Nakandala, S., Kumar, A.: Intermittent human-in-the-loop model selection using cerebro: a demonstration. Proceedings of the VLDB Endowment 14(12) (2021)","DOI":"10.14778\/3476311.3476320"},{"issue":"11","key":"919_CR35","doi-asserted-by":"publisher","first-page":"2327","DOI":"10.14778\/3476249.3476284","volume":"14","author":"S Li","year":"2021","unstructured":"Li, S., Kumar, A.: Towards an optimized group by abstraction for large-scale machine learning. Proc. VLDB Endow. 14(11), 2327\u20132340 (2021)","journal-title":"Proc. VLDB Endow."},{"key":"919_CR36","doi-asserted-by":"publisher","unstructured":"Li, X.L., Liang, P.: Prefix-tuning: Optimizing continuous prompts for generation. In: Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pp. 4582\u20134597. Association for Computational Linguistics, Online (2021). https:\/\/doi.org\/10.18653\/v1\/2021.acl-long.353.https:\/\/aclanthology.org\/2021.acl-long.353","DOI":"10.18653\/v1\/2021.acl-long.353."},{"key":"919_CR37","doi-asserted-by":"crossref","unstructured":"Li, Y., Han, Z., Zhang, Q., Li, Z., Tan, H.: Automating cloud deployment for deep learning inference of real-time online services. In: IEEE INFOCOM 2020-IEEE Conference on Computer Communications, pp. 1668\u20131677. IEEE (2020)","DOI":"10.1109\/INFOCOM41043.2020.9155267"},{"key":"919_CR38","unstructured":"Liu, Y., Ott, M., Goyal, N., Du, J., Joshi, M., Chen, D., Levy, O., Lewis, M., Zettlemoyer, L., Stoyanov, V.: Roberta: A robustly optimized bert pretraining approach. arXiv preprint (2019)"},{"key":"919_CR39","doi-asserted-by":"crossref","unstructured":"Liu, Z., Leng, J., Zhang, Z., Chen, Q., Li, C., Guo, M.: Veltair: towards high-performance multi-tenant deep learning services via adaptive compilation and scheduling. In: Proceedings of the 27th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, pp. 388\u2013401 (2022)","DOI":"10.1145\/3503222.3507752"},{"key":"919_CR40","doi-asserted-by":"crossref","unstructured":"Lo, K., Wang, L.L., Neumann, M., Kinney, R., Weld, D.: S2ORC: The semantic scholar open research corpus. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 4969\u20134983. Association for Computational Linguistics, Online (2020). doi:10.18653\/v1\/2020.acl-main.447. https:\/\/aclanthology.org\/2020.acl-main.447","DOI":"10.18653\/v1\/2020.acl-main.447"},{"issue":"12","key":"919_CR41","doi-asserted-by":"publisher","first-page":"2159","DOI":"10.14778\/3407790.3407816","volume":"13","author":"S Nakandala","year":"2020","unstructured":"Nakandala, S., Zhang, Y., Kumar, A.: Cerebro: a data system for optimized deep learning model selection. Proc. VLDB Endow. 13(12), 2159\u20132173 (2020)","journal-title":"Proc. VLDB Endow."},{"key":"919_CR42","doi-asserted-by":"crossref","unstructured":"Narayan, S., Cohen, S.B., Lapata, M.: Don\u2019t give me the details, just the summary! topic-aware convolutional neural networks for extreme summarization. arXiv:abs\/1808.08745 (2018)","DOI":"10.18653\/v1\/D18-1206"},{"key":"919_CR43","unstructured":"Olston, C., Fiedel, N., Gorovoy, K., Harmsen, J., Lao, L., Li, F., Rajashekhar, V., Ramesh, S., Soyke, J.: Tensorflow-serving: Flexible, high-performance ml serving. arXiv preprint:abs\/1712.06139 (2017)"},{"key":"919_CR44","unstructured":"Paster, K., Santos, M.D., Azerbayev, Z., Ba, J.: Openwebmath: An open dataset of high-quality mathematical web text (2023). https:\/\/arxiv.org\/abs\/2310.06786"},{"key":"919_CR45","unstructured":"Qin, H., Ding, Y., Zhang, M., Qinghua, Y., Liu, A., Dang, Q., Liu, Z., Liu, X.: Bibert: Accurate fully binarized bert. In: International Conference on Learning Representations (2021)"},{"key":"919_CR46","volume-title":"Improving language understanding by generative pre-training","author":"A Radford","year":"2018","unstructured":"Radford, A., Narasimhan, K., Salimans, T., Sutskever, I.: Improving language understanding by generative pre-training. Tech. rep, OpenAI (2018)"},{"issue":"5","key":"919_CR47","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1109\/72.248452","volume":"4","author":"R Reed","year":"1993","unstructured":"Reed, R.: Pruning algorithms-a survey. IEEE Trans. Neural Netw. 4(5), 740\u2013747 (1993)","journal-title":"IEEE Trans. Neural Netw."},{"key":"919_CR48","doi-asserted-by":"publisher","unstructured":"Rogers, A., Kovaleva, O., Rumshisky, A.: A primer in BERTology: What we know about how BERT works. Transactions of the Association for Computational Linguistics 8, 842\u2013866 (2020). https:\/\/doi.org\/10.1162\/tacl_a_00349.https:\/\/aclanthology.org\/2020.tacl-1.54","DOI":"10.1162\/tacl_a_00349."},{"key":"919_CR49","doi-asserted-by":"publisher","unstructured":"R\u00fcckl\u00e9, A., Geigle, G., Glockner, M., Beck, T., Pfeiffer, J., Reimers, N., Gurevych, I.: AdapterDrop: On the efficiency of adapters in transformers. In: Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pp. 7930\u20137946. Association for Computational Linguistics, Online and Punta Cana, Dominican Republic (2021). https:\/\/doi.org\/10.18653\/v1\/2021.emnlp-main.626.https:\/\/aclanthology.org\/2021.emnlp-main.626","DOI":"10.18653\/v1\/2021.emnlp-main.626."},{"key":"919_CR50","unstructured":"Sanh, V., Debut, L., Chaumond, J., Wolf, T.: Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. ArXiv preprint arxiv:1910.01108 (2019)"},{"key":"919_CR51","unstructured":"Schelter, S., Biessmann, F., Januschowski, T., Salinas, D., Seufert, S., Szarvas, G.: On challenges in machine learning model management. IEEE Data Engineering Bulletin (2015). https:\/\/www.amazon.science\/publications\/on-challenges-in-machine-learning-model-management"},{"key":"919_CR52","unstructured":"Sheng, Y., Cao, S., Li, D., Hooper, C., Lee, N., Yang, S., Chou, C., Zhu, B., Zheng, L., Keutzer, K., et\u00a0al.: S-lora: Serving thousands of concurrent lora adapters. arXiv preprint arXiv:2311.03285 (2023)"},{"key":"919_CR53","unstructured":"Stephenson, C., suchismita padhy, Ganesh, A., Hui, Y., Tang, H., Chung, S.: On the geometry of generalization and memorization in deep neural networks. In: International Conference on Learning Representations (2021). https:\/\/openreview.net\/forum?id=V8jrrnwGbuc"},{"key":"919_CR54","doi-asserted-by":"crossref","unstructured":"Sun, S., Cheng, Y., Gan, Z., Liu, J.: Patient knowledge distillation for BERT model compression. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp. 4323\u20134332. Association for Computational Linguistics, Hong Kong, China (2019). doi:10.18653\/v1\/D19-1441. https:\/\/aclanthology.org\/D19-1441","DOI":"10.18653\/v1\/D19-1441"},{"key":"919_CR55","doi-asserted-by":"crossref","unstructured":"Sun, Z., Yu, H., Song, X., Liu, R., Yang, Y., Zhou, D.: MobileBERT: a compact task-agnostic BERT for resource-limited devices. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pp. 2158\u20132170. Association for Computational Linguistics, Online (2020). doi:10.18653\/v1\/2020.acl-main.195. https:\/\/aclanthology.org\/2020.acl-main.195","DOI":"10.18653\/v1\/2020.acl-main.195"},{"key":"919_CR56","unstructured":"Touvron, H., Martin, L., Stone, K., et al.: Llama 2: Open foundation and fine-tuned chat models (2023)"},{"key":"919_CR57","unstructured":"Vartak, M.: MODELDB: A system for machine learning model management. In: CIDR 2017, 8th Biennial Conference on Innovative Data Systems Research, Chaminade, CA, USA, January 8-11, 2017, Online Proceedings. www.cidrdb.org (2017). http:\/\/cidrdb.org\/cidr2017\/gongshow\/abstracts\/cidr2017_112.pdf"},{"key":"919_CR58","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. In: I.\u00a0Guyon, U.\u00a0von Luxburg, S.\u00a0Bengio, H.M. Wallach, R.\u00a0Fergus, S.V.N. Vishwanathan, R.\u00a0Garnett (eds.) Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, pp. 5998\u20136008 (2017)"},{"key":"919_CR59","unstructured":"Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. In: I.\u00a0Guyon, U.\u00a0von Luxburg, S.\u00a0Bengio, H.M. Wallach, R.\u00a0Fergus, S.V.N. Vishwanathan, R.\u00a0Garnett (eds.) Advances in Neural Information Processing Systems 30: Annual Conference on Neural Information Processing Systems 2017, December 4-9, 2017, Long Beach, CA, USA, pp. 5998\u20136008 (2017). https:\/\/proceedings.neurips.cc\/paper\/2017\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"919_CR60","doi-asserted-by":"publisher","unstructured":"Wang, J., Chen, K., Chen, G., Shou, L., McAuley, J.: SkipBERT: Efficient inference with shallow layer skipping. In: Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp. 7287\u20137301. Association for Computational Linguistics, Dublin, Ireland (2022). https:\/\/doi.org\/10.18653\/v1\/2022.acl-long.503.https:\/\/aclanthology.org\/2022.acl-long.503","DOI":"10.18653\/v1\/2022.acl-long.503."},{"key":"919_CR61","doi-asserted-by":"crossref","unstructured":"Wang, J., Chen, K., Shou, L., Jiang, D., Chen, G.: Smile: A cost-effective system for serving massive pretrained language models in the cloud (demo). In: Proceedings of the 2023 International Conference on Management of Data (2023)","DOI":"10.1145\/3555041.3589720"},{"key":"919_CR62","unstructured":"Wang, W., Wei, F., Dong, L., Bao, H., Yang, N., Zhou, M.: Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. In: H.\u00a0Larochelle, M.\u00a0Ranzato, R.\u00a0Hadsell, M.\u00a0Balcan, H.\u00a0Lin (eds.) Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems 2020, NeurIPS 2020, December 6-12, 2020, virtual (2020). https:\/\/proceedings.neurips.cc\/paper\/2020\/hash\/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html"},{"key":"919_CR63","unstructured":"Xiao, G., Tian, Y., Chen, B., Han, S., Lewis, M.: Efficient streaming language models with attention sinks. In: The Twelfth International Conference on Learning Representations, ICLR 2024, Vienna, Austria, May 7-11, 2024. OpenReview.net (2024). https:\/\/openreview.net\/forum?id=NG7sS51zVF"},{"key":"919_CR64","doi-asserted-by":"crossref","unstructured":"Xue, J., Birke, R., Chen, L.Y., Smirni, E.: Managing data center tickets: Prediction and active sizing. In: 2016 46th Annual IEEE\/IFIP International Conference on Dependable Systems and Networks (DSN), pp. 335\u2013346. IEEE (2016)","DOI":"10.1109\/DSN.2016.38"},{"key":"919_CR65","unstructured":"Yang, Z., Dai, Z., Yang, Y., Carbonell, J.G., Salakhutdinov, R., Le, Q.V.: Xlnet: Generalized autoregressive pretraining for language understanding. In: H.M. Wallach, H.\u00a0Larochelle, A.\u00a0Beygelzimer, F.\u00a0d\u2019Alch\u00e9-Buc, E.B. Fox, R.\u00a0Garnett (eds.) Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems 2019, NeurIPS 2019, December 8-14, 2019, Vancouver, BC, Canada, pp. 5754\u20135764 (2019). https:\/\/proceedings.neurips.cc\/paper\/2019\/hash\/dc6a7e655d 7e5840e66733e9ee67cc69-Abstract.html"},{"key":"919_CR66","doi-asserted-by":"crossref","unstructured":"Zafrir, O., Boudoukh, G., Izsak, P., Wasserblat, M.: Q8bert: Quantized 8bit bert. In: 2019 Fifth Workshop on Energy Efficient Machine Learning and Cognitive Computing-NeurIPS Edition (EMC2-NIPS), pp. 36\u201339. IEEE (2019)","DOI":"10.1109\/EMC2-NIPS53020.2019.00016"},{"key":"919_CR67","unstructured":"Zhang, C., Yu, M., Wang, W., Yan, F.: $$\\{$$MArk$$\\}$$: Exploiting cloud services for $$\\{$$Cost-Effective$$\\}$$,$$\\{$$SLO-Aware$$\\}$$ machine learning inference serving. In: 2019 USENIX Annual Technical Conference (USENIX ATC 19), pp. 1049\u20131062 (2019)"},{"key":"919_CR68","doi-asserted-by":"crossref","unstructured":"Zhang, J., Wang, J., Li, H., Shou, L., Chen, K., Chen, G., Mehrotra, S.: Draft & verify: Lossless large language model acceleration via self-speculative decoding. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), ACL 2024, Bangkok, Thailand, August 11-16, 2024 (2024)","DOI":"10.18653\/v1\/2024.acl-long.607"},{"key":"919_CR69","unstructured":"Zhang, S., Roller, S., Goyal, N., Artetxe, M., Chen, M., Chen, S., Dewan, C., Diab, M., Li, X., Lin, X.V., et\u00a0al.: Opt: Open pre-trained transformer language models. ArXiv preprint arXiv::abs\/2205.01068 (2022)"},{"key":"919_CR70","unstructured":"Zhou, Z., Wei, X., Zhang, J., Sun, G.: $$\\{$$PetS$$\\}$$: A unified framework for $$\\{$$Parameter-Efficient$$\\}$$ transformers serving. In: 2022 USENIX Annual Technical Conference (USENIX ATC 22), pp. 489\u2013504 (2022)"},{"key":"919_CR71","unstructured":"Zhang, J., Wang, J., Li, H., Shou, L., Chen, K., You, Y., Xie, G., Gong, X., Zhou, K. (2025). Train Small, Infer Large: Memory-Efficient LoRA Training for Large Language Models. arXiv preprint arXiv:2502.13533. https:\/\/openreview.net\/forum?id=57K0pgkRxL"}],"container-title":["The VLDB Journal"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00778-025-00919-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s00778-025-00919-7\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s00778-025-00919-7.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,9,6]],"date-time":"2025-09-06T14:47:07Z","timestamp":1757170027000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s00778-025-00919-7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,5,14]]},"references-count":71,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2025,7]]}},"alternative-id":["919"],"URL":"https:\/\/doi.org\/10.1007\/s00778-025-00919-7","relation":{},"ISSN":["1066-8888","0949-877X"],"issn-type":[{"type":"print","value":"1066-8888"},{"type":"electronic","value":"0949-877X"}],"subject":[],"published":{"date-parts":[[2025,5,14]]},"assertion":[{"value":"24 June 2024","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"26 January 2025","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 April 2025","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 May 2025","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}],"article-number":"43"}}