{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,10]],"date-time":"2026-05-10T18:24:09Z","timestamp":1778437449748,"version":"3.51.4"},"reference-count":112,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T00:00:00Z","timestamp":1736985600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T00:00:00Z","timestamp":1736985600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Nat Mach Intell"],"DOI":"10.1038\/s42256-024-00961-0","type":"journal-article","created":{"date-parts":[[2025,1,16]],"date-time":"2025-01-16T10:02:59Z","timestamp":1737021779000},"page":"6-17","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":57,"title":["Learning from models beyond fine-tuning"],"prefix":"10.1038","volume":"7","author":[{"ORCID":"https:\/\/orcid.org\/0000-0001-9077-1594","authenticated-orcid":false,"given":"Hongling","family":"Zheng","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5659-3464","authenticated-orcid":false,"given":"Li","family":"Shen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0576-8153","authenticated-orcid":false,"given":"Anke","family":"Tang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2296-6370","authenticated-orcid":false,"given":"Yong","family":"Luo","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7532-0496","authenticated-orcid":false,"given":"Han","family":"Hu","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-0059-8458","authenticated-orcid":false,"given":"Bo","family":"Du","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2751-5114","authenticated-orcid":false,"given":"Yonggang","family":"Wen","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-5449","authenticated-orcid":false,"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2025,1,16]]},"reference":[{"key":"961_CR1","doi-asserted-by":"publisher","unstructured":"Bommasani, R. et al. On the opportunities and risks of foundation models. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2108.07258 (2021).","DOI":"10.48550\/arXiv.2108.07258"},{"key":"961_CR2","unstructured":"Yuan, Y. On the power of foundation models. In Proc. 40th International Conference on Machine Learning 40519\u201340530 (PMLR, 2023)."},{"key":"961_CR3","doi-asserted-by":"crossref","unstructured":"Xu, R. et al. Knowledge conflicts for LLMs: a survey. In Proc. 2024 Conference Empirical Methods in Natural Language Processing 8541\u20138565 (ACL, 2024).","DOI":"10.18653\/v1\/2024.emnlp-main.486"},{"key":"961_CR4","doi-asserted-by":"crossref","unstructured":"Dupuis, E., Novo, D., O\u2019Connor, I. & Bosio, A. A heuristic exploration of retraining-free weight-sharing for CNN compression. In Proc. 
27th Asia and South Pacific Design Automation Conference 134\u2013139 (IEEE, 2022).","DOI":"10.1109\/ASP-DAC52403.2022.9712487"},{"key":"961_CR5","doi-asserted-by":"publisher","unstructured":"Wei, J. et al. Emergent abilities of large language models. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2206.07682 (2022). The emergent abilities of foundation models in various scenarios are discussed, providing a highly instructive overview of directions for future research.","DOI":"10.48550\/arXiv.2206.07682"},{"key":"961_CR6","doi-asserted-by":"publisher","first-page":"1789","DOI":"10.1007\/s11263-021-01453-z","volume":"129","author":"J Gou","year":"2021","unstructured":"Gou, J., Yu, B., Maybank, S. J. & Tao, D. Knowledge distillation: a survey. Int. J. Comput. Vis. 129, 1789\u20131819 (2021).","journal-title":"Int. J. Comput. Vis."},{"key":"961_CR7","first-page":"1","volume":"57","author":"S Wang","year":"2024","unstructured":"Wang, S. et al. Knowledge editing for large language models: a survey. ACM Comput. Surv. 57, 1\u201337 (2024).","journal-title":"ACM Comput. Surv."},{"key":"961_CR8","doi-asserted-by":"publisher","first-page":"220","DOI":"10.1038\/s42256-023-00626-4","volume":"5","author":"N Ding","year":"2023","unstructured":"Ding, N. et al. Parameter-efficient fine-tuning of large-scale pre-trained language models. Nat. Mach. Intell. 5, 220\u2013235 (2023).","journal-title":"Nat. Mach. Intell."},{"key":"961_CR9","doi-asserted-by":"publisher","unstructured":"Shen, T. et al. Large language model alignment: a survey. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2309.15025 (2023).","DOI":"10.48550\/arXiv.2309.15025"},{"key":"961_CR10","doi-asserted-by":"publisher","unstructured":"Zhao, W. X. et al. A survey of large language models. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2303.18223 (2023).","DOI":"10.48550\/arXiv.2303.18223"},{"key":"961_CR11","doi-asserted-by":"publisher","first-page":"1299","DOI":"10.1109\/TMI.2016.2535302","volume":"35","author":"N Tajbakhsh","year":"2016","unstructured":"Tajbakhsh, N. et al. Convolutional neural networks for medical image analysis: full training or fine tuning? IEEE Trans. Med. Imag. 35, 1299\u20131312 (2016).","journal-title":"IEEE Trans. Med. Imag."},{"key":"961_CR12","unstructured":"Radford, A. et al. Improving language understanding by generative pre-training (2018). This paper is one of the studies that pioneered fine-tuning strategies in LLMs, demonstrating effectiveness across various natural language understanding benchmarks."},{"key":"961_CR13","unstructured":"Malladi, S., Wettig, A., Yu, D., Chen, D. & Arora, S. A kernel-based view of language model fine-tuning. In Proc. 40th International Conference on Machine Learning 23610\u201323641 (PMLR, 2023)."},{"key":"961_CR14","first-page":"53038","volume":"36","author":"S Malladi","year":"2023","unstructured":"Malladi, S. et al. Fine-tuning language models with just forward passes. Adv. Neural Inf. Process. Syst. 36, 53038\u201353075 (2023).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR15","doi-asserted-by":"crossref","unstructured":"Pfeiffer, J. et al. AdapterFusion: nondestructive task composition for transfer learning. In Proc. 16th Conference of the European Chapter of the Association for Computational Linguistics 487\u2013503 (ACL, 2021).","DOI":"10.18653\/v1\/2021.eacl-main.39"},{"key":"961_CR16","first-page":"1022","volume":"34","author":"R Karimi Mahabadi","year":"2021","unstructured":"Karimi Mahabadi, R., Henderson, J. & Ruder, S. 
Compacter: efficient low-rank hypercomplex adapter layers. Adv. Neural Inf. Process. Syst. 34, 1022\u20131035 (2021).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR17","doi-asserted-by":"crossref","unstructured":"Chen, Z.-C., Fu, C.-L., Liu, C.-Y., Li, S. W. D. & Lee, H.-Y. Exploring efficient-tuning methods in self-supervised speech models. In 2022 IEEE Spoken Language Technology Workshop 1120\u20131127 (IEEE, 2023).","DOI":"10.1109\/SLT54892.2023.10023274"},{"key":"961_CR18","doi-asserted-by":"crossref","unstructured":"Vander Eeckt, S. & Van Hamme, H. Using adapters to overcome catastrophic forgetting in end-to-end automatic speech recognition. In Proc. 2023 IEEE International Conference on Acoustics, Speech and Signal Processing 1\u20135 (IEEE, 2023).","DOI":"10.1109\/ICASSP49357.2023.10095837"},{"key":"961_CR19","first-page":"1877","volume":"33","author":"T Brown","year":"2020","unstructured":"Brown, T. et al. Language models are few-shot learners. Adv. Neural Inf. Process. Syst. 33, 1877\u20131901 (2020).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR20","doi-asserted-by":"crossref","unstructured":"Gu, Y., Han, X., Liu, Z. & Huang, M. PPT: pre-trained prompt tuning for few-shot learning. In Proc. 60th Annual Meeting of the Association for Computational Linguistics 8410\u20138423 (ACL, 2022).","DOI":"10.18653\/v1\/2022.acl-long.576"},{"key":"961_CR21","doi-asserted-by":"crossref","unstructured":"Lester, B., Al-Rfou, R. & Constant, N. The power of scale for parameter-efficient prompt tuning. In Proc. 2021 Conference on Empirical Methods in Natural Language Processing 3045\u20133059 (ACL, 2021). This paper is a milestone work in parameter-efficient fine-tuning methods; it introduced prompt tuning and notably reduced the fine-tuning cost of models while achieving competitive performance.","DOI":"10.18653\/v1\/2021.emnlp-main.243"},{"key":"961_CR22","unstructured":"Li, X. L. & Liang, P. Prefix-tuning: optimizing continuous prompts for generation. In Proc. 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing 4582\u20134597 (ACL, 2021)."},{"key":"961_CR23","doi-asserted-by":"publisher","first-page":"182","DOI":"10.1016\/j.aiopen.2022.11.003","volume":"3","author":"X Han","year":"2022","unstructured":"Han, X., Zhao, W., Ding, N., Liu, Z. & Sun, M. PTR: prompt tuning with rules for text classification. AI Open 3, 182\u2013192 (2022).","journal-title":"AI Open"},{"key":"961_CR24","unstructured":"Sun, T., Shao, Y., Qian, H., Huang, X. & Qiu, X. Black-box tuning for language-model-as-a-service. In Proc. 39th International Conference on Machine Learning 20841\u201320855 (PMLR, 2022)."},{"key":"961_CR25","doi-asserted-by":"crossref","unstructured":"Sun, T. et al. BBTv2: towards a gradient-free future with large language models. In Proc. 2022 Conference on Empirical Methods in Natural Language Processing 3916\u20133930 (ACL, 2022).","DOI":"10.18653\/v1\/2022.emnlp-main.259"},{"key":"961_CR26","doi-asserted-by":"publisher","unstructured":"Diao, S., Li, X., Lin, Y., Huang, Z. & Zhang, T. Black-box prompt learning for pretrained language models. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2201.08531 (2022).","DOI":"10.48550\/arXiv.2201.08531"},{"key":"961_CR27","doi-asserted-by":"publisher","unstructured":"Xiao, G., Lin, J. & Han, S. Offsite-tuning: transfer learning without full model. 
Preprint at https:\/\/doi.org\/10.48550\/arXiv.2302.04870 (2023).","DOI":"10.48550\/arXiv.2302.04870"},{"key":"961_CR28","first-page":"34892","volume":"36","author":"H Liu","year":"2023","unstructured":"Liu, H., Li, C., Wu, Q. & Lee, Y. J. Visual instruction tuning. Adv. Neural Inf. Process. Syst. 36, 34892\u201334916 (2023).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR29","unstructured":"Longpre, S. et al. The FLAN collection: designing data and methods for effective instruction tuning. In Proc. 40th International Conference on Machine Learning 22631\u201322648 (PMLR, 2023)."},{"key":"961_CR30","doi-asserted-by":"publisher","unstructured":"Peng, B., Li, C., He, P., Galley, M. & Gao, J. Instruction tuning with GPT-4. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2304.03277 (2023).","DOI":"10.48550\/arXiv.2304.03277"},{"key":"961_CR31","unstructured":"Wei, J. et al. Finetuned language models are zero-shot learners. In Proc. 10th International Conference on Learning Representations 1\u201321 (OpenReview, 2022)."},{"key":"961_CR32","unstructured":"Sanh, V. et al. Multitask prompted training enables zero-shot task generalization. In Proc. 10th International Conference on Learning Representations 1\u201317 (OpenReview, 2022)."},{"key":"961_CR33","unstructured":"Shi, R., Liu, Y., Ze, Y., Du, S. S. & Xu, H. Unleashing the power of pre-trained language models for offline reinforcement learning. In Proc. 12th International Conference on Learning Representations 1\u201316 (OpenReview, 2024)."},{"key":"961_CR34","unstructured":"Wang, Z. et al. Multitask prompt tuning enables parameter-efficient transfer learning. In Proc. 11th International Conference on Learning Representations 1\u201315 (OpenReview, 2023)."},{"key":"961_CR35","doi-asserted-by":"publisher","unstructured":"Lee, J., Tang, R. & Lin, J. What would Elsa do? Freezing layers during transformer fine-tuning. Preprint at https:\/\/doi.org\/10.48550\/arXiv.1911.03090 (2019).","DOI":"10.48550\/arXiv.1911.03090"},{"key":"961_CR36","first-page":"36161","volume":"35","author":"J Wu","year":"2022","unstructured":"Wu, J. et al. Scaling multimodal pre-training via cross-modality gradient harmonization. Adv. Neural Inf. Process. Syst. 35, 36161\u201336173 (2022).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR37","first-page":"6000","volume":"30","author":"A Vaswani","year":"2017","unstructured":"Vaswani, A. et al. Attention is all you need. Adv. Neural Inf. Process. Syst. 30, 6000\u20136010 (2017).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR38","unstructured":"Wen, Y., Tran, D. & Ba, J. BatchEnsemble: an alternative approach to efficient ensemble and lifelong learning. In Proc. 8th International Conference on Learning Representations 1\u201313 (OpenReview, 2020)."},{"key":"961_CR39","doi-asserted-by":"crossref","unstructured":"Liu, T. Y. & Soatto, S. Tangent model composition for ensembling and continual fine-tuning. In Proc. IEEE\/CVF International Conference on Computer Vision 18676\u201318686 (IEEE, 2023).","DOI":"10.1109\/ICCV51070.2023.01712"},{"key":"961_CR40","unstructured":"Shazeer, N. et al. Outrageously large neural networks: the sparsely-gated mixture-of-experts layer. In Proc. 5th International Conference on Learning Representations 1\u201312 (OpenReview, 2017). 
A pioneering work applying sparse mixture of experts to large-scale neural networks, this paper demonstrates substantial enhancements in model performance and capacity while maintaining impressive computational efficiency."},{"key":"961_CR41","first-page":"5232","volume":"23","author":"W Fedus","year":"2022","unstructured":"Fedus, W., Zoph, B. & Shazeer, N. Switch transformers: scaling to trillion parameter models with simple and efficient sparsity. J. Mach. Learn. Res. 23, 5232\u20135270 (2022).","journal-title":"J. Mach. Learn. Res."},{"key":"961_CR42","unstructured":"Du, N. et al. GLaM: efficient scaling of language models with mixture-of-experts. In Proc. 39th International Conference on Machine Learning 5547\u20135569 (PMLR, 2022)."},{"key":"961_CR43","doi-asserted-by":"crossref","unstructured":"Dai, D. et al. DeepSeekMoE: towards ultimate expert specialization in mixture-of-experts language models. In Proc. 62nd Annual Meeting of the Association for Computational Linguistics 1280\u20131297 (ACL, 2024).","DOI":"10.18653\/v1\/2024.acl-long.70"},{"key":"961_CR44","doi-asserted-by":"publisher","unstructured":"Jiang, A. Q. et al. Mixtral of experts. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2401.04088 (2024).","DOI":"10.48550\/arXiv.2401.04088"},{"key":"961_CR45","unstructured":"Yunis, D. et al. On convexity and linear mode connectivity in neural networks. In 16th International OPT Workshop on Optimization for Machine Learning 1\u20139 (OpenReview, 2022)."},{"key":"961_CR46","unstructured":"Wortsman, M. et al. Model soups: averaging weights of multiple fine-tuned models improves accuracy without increasing inference time. In Proc. 39th International Conference on Machine Learning 23965\u201323998 (PMLR, 2022)."},{"key":"961_CR47","doi-asserted-by":"crossref","unstructured":"Wortsman, M. et al. Robust fine-tuning of zero-shot models. In Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR) 7959\u20137971 (IEEE, 2022).","DOI":"10.1109\/CVPR52688.2022.00780"},{"key":"961_CR48","doi-asserted-by":"crossref","unstructured":"Lu, P. et al. Improving generalization of pre-trained language models via stochastic weight averaging. In Findings of the Association for Computational Linguistics: EMNLP 2022 4948\u20134954 (ACL, 2022).","DOI":"10.18653\/v1\/2022.findings-emnlp.363"},{"key":"961_CR49","unstructured":"Kaddour, J. Stop wasting my time! Saving days of ImageNet and BERT training with latest weight averaging. In Has it Trained Yet? Workshop NeurIPS 2022 1\u20137 (OpenReview, 2022)."},{"key":"961_CR50","unstructured":"Sanyal, S., Neerkaje, A., Kaddour, J., Kumar, A. & Sanghavi, S. Early weight averaging meets high learning rates for LLM pre-training. In Workshop on Advancing Neural Network Training: Computational Efficiency, Scalability, and Resource Optimization 1\u201313 (OpenReview, 2023)."},{"key":"961_CR51","unstructured":"Ilharco, G. et al. Editing models with task arithmetic. In Proc. 11th International Conference on Learning Representations 1\u201317 (OpenReview, 2023). This work introduces task vectors and their manipulation through simple arithmetic operations, enabling predictable model behaviour changes."},{"key":"961_CR52","unstructured":"Yang, E. et al. AdaMerging: adaptive model merging for multi-task learning. In Proc. 12th International Conference on Learning Representations 1\u201313 (OpenReview, 2024)."},{"key":"961_CR53","unstructured":"Ainsworth, S. K., Hayase, J. & Srinivasa, S. Git Re-Basin: merging models modulo permutation symmetries. In Proc. 
11th International Conference on Learning Representations 1\u201318 (OpenReview, 2023)."},{"key":"961_CR54","unstructured":"Stoica, G. et al. ZipIt! Merging models from different tasks without training. In Proc. 12th International Conference on Learning Representations 1\u201313 (OpenReview, 2024)."},{"key":"961_CR55","first-page":"22045","volume":"33","author":"SP Singh","year":"2020","unstructured":"Singh, S. P. & Jaggi, M. Model fusion via optimal transport. Adv. Neural Inf. Process. Syst. 33, 22045\u201322055 (2020).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR56","unstructured":"Jin, X., Ren, X., Preotiuc-Pietro, D. & Cheng, P. Dataless knowledge fusion by merging weights of language models. In Proc. 11th International Conference on Learning Representations 1\u201313 (OpenReview, 2023)."},{"key":"961_CR57","doi-asserted-by":"publisher","first-page":"3002","DOI":"10.1109\/TMM.2020.2966885","volume":"22","author":"Y Lou","year":"2020","unstructured":"Lou, Y. et al. Towards efficient front-end visual sensing for digital retina: a model-centric paradigm. IEEE Trans. Multimed. 22, 3002\u20133013 (2020).","journal-title":"IEEE Trans. Multimed."},{"key":"961_CR58","first-page":"17703","volume":"35","author":"MS Matena","year":"2022","unstructured":"Matena, M. S. & Raffel, C. A. Merging models with Fisher-weighted averaging. Adv. Neural Inf. Process. Syst. 35, 17703\u201317716 (2022).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR59","doi-asserted-by":"publisher","first-page":"13344","DOI":"10.1109\/TPAMI.2023.3292075","volume":"45","author":"Z Zhu","year":"2023","unstructured":"Zhu, Z., Lin, K., Jain, A. K. & Zhou, J. Transfer learning in deep reinforcement learning: a survey. IEEE Trans. Pattern Anal. Mach. Intell. 45, 13344\u201313362 (2023).","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"961_CR60","unstructured":"Pillutla, K. et al. Federated learning with partial model personalization. In Proc. 39th International Conference on Machine Learning 17716\u201317758 (PMLR, 2022)."},{"key":"961_CR61","doi-asserted-by":"crossref","unstructured":"Khattab, O. & Zaharia, M. ColBERT: efficient and effective passage search via contextualized late interaction over BERT. In Proc. 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval 39\u201348 (ACM, 2020).","DOI":"10.1145\/3397271.3401075"},{"key":"961_CR62","doi-asserted-by":"crossref","unstructured":"Xin, Y., Du, J., Wang, Q., Yan, K. & Ding, S. MMap: multi-modal alignment prompt for cross-domain multi-task learning. In Proc. AAAI Conference on Artificial Intelligence Vol. 38, 16076\u201316084 (AAAI, 2024).","DOI":"10.1609\/aaai.v38i14.29540"},{"key":"961_CR63","doi-asserted-by":"publisher","first-page":"90","DOI":"10.1016\/j.cobeha.2021.01.002","volume":"38","author":"JX Wang","year":"2021","unstructured":"Wang, J. X. Meta-learning in natural and artificial intelligence. Curr. Opin. Behav. Sci. 38, 90\u201395 (2021).","journal-title":"Curr. Opin. Behav. Sci."},{"key":"961_CR64","unstructured":"Wang, Z. et al. Meta-learning without data via Wasserstein distributionally-robust model fusion. In Proc. 38th Conference on Uncertainty in Artificial Intelligence 2045\u20132055 (PMLR, 2022)."},{"key":"961_CR65","unstructured":"Finn, C., Abbeel, P. & Levine, S. Model-agnostic meta-learning for fast adaptation of deep networks. In Proc. 
34th International Conference on Machine Learning 1126\u20131135 (PMLR, 2017)."},{"key":"961_CR66","first-page":"16532","volume":"33","author":"M Caccia","year":"2020","unstructured":"Caccia, M. et al. Online fast adaptation and knowledge accumulation (OSAKA): a new approach to continual learning. Adv. Neural Inf. Process. Syst. 33, 16532\u201316545 (2020).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR67","doi-asserted-by":"crossref","unstructured":"Sun, Q., Liu, Y., Chua, T.-S. & Schiele, B. Meta-transfer learning for few-shot learning. In Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition 403\u2013412 (IEEE, 2019).","DOI":"10.1109\/CVPR.2019.00049"},{"key":"961_CR68","doi-asserted-by":"publisher","unstructured":"Zhang, G., Luo, Z., Cui, K. & Lu, S. Meta-DETR: few-shot object detection via unified image-level meta-learning. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2103.11731 (2021).","DOI":"10.48550\/arXiv.2103.11731"},{"key":"961_CR69","doi-asserted-by":"publisher","unstructured":"Nichol, A., Achiam, J. & Schulman, J. On first-order meta-learning algorithms. Preprint at https:\/\/doi.org\/10.48550\/arXiv.1803.02999 (2018). This study presents one of the earliest methods for simplifying meta-learning implementation through effective parameter initialization, enhancing the flexibility and applicability of foundational models in real-world scenarios.","DOI":"10.48550\/arXiv.1803.02999"},{"key":"961_CR70","unstructured":"Hu, Z. et al. Learning to learn from APIs: black-box data-free meta-learning. In Proc. 40th International Conference on Machine Learning 13610\u201313627 (PMLR, 2023)."},{"key":"961_CR71","doi-asserted-by":"crossref","unstructured":"Hu, Z. et al. Architecture, dataset and model-scale agnostic data-free meta-learning. In Proc. IEEE\/CVF Conference on Computer Vision and Pattern Recognition 7736\u20137745 (IEEE, 2023).","DOI":"10.1109\/CVPR52729.2023.00747"},{"key":"961_CR72","doi-asserted-by":"crossref","unstructured":"Xu, K. et al. Efficient joint optimization of layer-adaptive weight pruning in deep neural networks. In Proc. IEEE\/CVF International Conference on Computer Vision 17447\u201317457 (IEEE, 2023).","DOI":"10.1109\/ICCV51070.2023.01600"},{"key":"961_CR73","doi-asserted-by":"crossref","unstructured":"Tan, Y. et al. Federated learning on non-IID graphs via structural knowledge sharing. In Proc. AAAI Conference on Artificial Intelligence Vol. 37, 9953\u20139961 (AAAI, 2023).","DOI":"10.1609\/aaai.v37i8.26187"},{"key":"961_CR74","doi-asserted-by":"publisher","first-page":"5443","DOI":"10.1109\/TSP.2022.3222734","volume":"70","author":"X Zhang","year":"2022","unstructured":"Zhang, X., Hu, C., He, B. & Han, Z. Distributed Reptile algorithm for meta-learning over multi-agent systems. IEEE Trans. Signal Process. 70, 5443\u20135456 (2022).","journal-title":"IEEE Trans. Signal Process."},{"key":"961_CR75","first-page":"37647","volume":"35","author":"X Song","year":"2022","unstructured":"Song, X., Zheng, S., Cao, W., Yu, J. & Bian, J. Efficient and effective multi-task grouping via meta learning on task combinations. Adv. Neural Inf. Process. Syst. 35, 37647\u201337659 (2022).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR76","doi-asserted-by":"crossref","unstructured":"Yao, Y. et al. Editing large language models: problems, methods, and opportunities. In Proc. 
2023 Conference on Empirical Methods in Natural Language Processing 10222\u201310240 (ACL, 2023).","DOI":"10.18653\/v1\/2023.emnlp-main.632"},{"key":"961_CR77","unstructured":"Mitchell, E., Lin, C., Bosselut, A., Manning, C. D. & Finn, C. Memory-based model editing at scale. In Proc. 39th International Conference on Machine Learning 15817\u201315831 (PMLR, 2022). This seminal work enables effective behaviour editing without modifying the parameters of foundational models while minimizing potential degradation from multiple edits, providing a practical solution for long-term maintenance and updates."},{"key":"961_CR78","unstructured":"Huang, Z. et al. Transformer-Patcher: one mistake worth one neuron. In Proc. 11th International Conference on Learning Representations 1\u201312 (OpenReview, 2023)."},{"key":"961_CR79","doi-asserted-by":"crossref","unstructured":"Dong, Q. et al. Calibrating factual knowledge in pretrained language models. In Findings of the Association for Computational Linguistics: EMNLP 2022 5937\u20135947 (ACL, 2022).","DOI":"10.18653\/v1\/2022.findings-emnlp.438"},{"key":"961_CR80","doi-asserted-by":"publisher","unstructured":"Zhu, C. et al. Modifying memories in transformer models. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2012.00363 (2020).","DOI":"10.48550\/arXiv.2012.00363"},{"key":"961_CR81","first-page":"17359","volume":"35","author":"K Meng","year":"2022","unstructured":"Meng, K., Bau, D., Andonian, A. & Belinkov, Y. Locating and editing factual associations in GPT. Adv. Neural Inf. Process. Syst. 35, 17359\u201317372 (2022). This important study locates and precisely edits specific factual knowledge within the complex architecture of foundation models, offering insights into model editing.","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR82","unstructured":"Mitchell, E., Lin, C., Bosselut, A., Finn, C. & Manning, C. D. Fast model editing at scale. In Proc. 10th International Conference on Learning Representations 1\u201314 (OpenReview, 2022)."},{"key":"961_CR83","unstructured":"Tan, C., Zhang, G. & Fu, J. Massive editing for large language models via meta learning. In Proc. 12th International Conference on Learning Representations 1\u201312 (OpenReview, 2024)."},{"key":"961_CR84","doi-asserted-by":"publisher","first-page":"110826","DOI":"10.1016\/j.knosys.2023.110826","volume":"279","author":"X Han","year":"2023","unstructured":"Han, X., Li, R., Li, X. & Pan, J. Z. A divide and conquer framework for knowledge editing. Knowl. Based Syst. 279, 110826 (2023).","journal-title":"Knowl. Based Syst."},{"key":"961_CR85","doi-asserted-by":"publisher","first-page":"250","DOI":"10.1007\/s10462-024-10862-8","volume":"57","author":"VK Chauhan","year":"2024","unstructured":"Chauhan, V. K., Zhou, J., Lu, P., Molaei, S. & Clifton, D. A. A brief review of hypernetworks in deep learning. Artif. Intell. Rev. 57, 250 (2024).","journal-title":"Artif. Intell. Rev."},{"key":"961_CR86","doi-asserted-by":"publisher","unstructured":"Liu, X. et al. Large language models and causal inference in collaboration: a comprehensive survey. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2403.09606 (2024).","DOI":"10.48550\/arXiv.2403.09606"},{"key":"961_CR87","doi-asserted-by":"publisher","first-page":"423","DOI":"10.1162\/tacl_a_00324","volume":"8","author":"Z Jiang","year":"2020","unstructured":"Jiang, Z., Xu, F. F., Araki, J. & Neubig, G. How can we know what language models know? Trans. Assoc. Comput. Linguist. 8, 423\u2013438 (2020).","journal-title":"Trans. Assoc. Comput. 
Linguist."},{"key":"961_CR88","doi-asserted-by":"crossref","unstructured":"Cohen, R., Geva, M., Berant, J. & Globerson, A. Crawling the internal knowledge-base of language models. In Findings of the Association for Computational Linguistics: EACL 2023 1856\u20131869 (ACL, 2023).","DOI":"10.18653\/v1\/2023.findings-eacl.139"},{"key":"961_CR89","first-page":"4302","volume":"30","author":"PF Christiano","year":"2017","unstructured":"Christiano, P. F. et al. Deep reinforcement learning from human preferences. Adv. Neural Inf. Process. Syst. 30, 4302\u20134310 (2017).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR90","doi-asserted-by":"publisher","unstructured":"Glaese, A. et al. Improving alignment of dialogue agents via targeted human judgements. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2209.14375 (2022). The authors present a method that uses dialogue as the medium for response generation to address harmful issues in reinforcement learning strategy distribution optimization.","DOI":"10.48550\/arXiv.2209.14375"},{"key":"961_CR91","unstructured":"Go, D. et al. Aligning language models with preferences through f-divergence minimization. In Proc. 40th International Conference on Machine Learning 11546\u201311583 (PMLR, 2023)."},{"key":"961_CR92","first-page":"181","volume":"35","author":"R Liu","year":"2022","unstructured":"Liu, R. et al. Second thoughts are best: learning to re-align with human values from text edits. Adv. Neural Inf. Process. Syst. 35, 181\u2013196 (2022).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR93","doi-asserted-by":"crossref","unstructured":"Kim, S. et al. Aligning large language models through synthetic feedback. In Proc. 2023 Conference on Empirical Methods in Natural Language Processing 13677\u201313700 (ACL, 2023).","DOI":"10.18653\/v1\/2023.emnlp-main.844"},{"key":"961_CR94","first-page":"62630","volume":"36","author":"Z Li","year":"2024","unstructured":"Li, Z. et al. Guiding large language models via directional stimulus prompting. Adv. Neural Inf. Process. Syst. 36, 62630\u201362656 (2024).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR95","doi-asserted-by":"crossref","unstructured":"Aky\u00fcrek, A. F. et al. RL4F: generating natural language feedback with reinforcement learning for repairing model outputs. In Proc. 61st Annual Meeting of the Association for Computational Linguistics 7716\u20137733 (ACL, 2023).","DOI":"10.18653\/v1\/2023.acl-long.427"},{"key":"961_CR96","doi-asserted-by":"publisher","unstructured":"Thoppilan, R. et al. LaMDA: language models for dialog applications. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2201.08239 (2022).","DOI":"10.48550\/arXiv.2201.08239"},{"key":"961_CR97","unstructured":"Zhao, Y. et al. Calibrating sequence likelihood improves conditional language generation. In Proc. 11th International Conference on Learning Representations 1\u201314 (OpenReview, 2023)."},{"key":"961_CR98","doi-asserted-by":"crossref","unstructured":"Song, F. et al. Preference ranking optimization for human alignment. In Proc. AAAI Conference on Artificial Intelligence Vol. 38, 18990\u201318998 (AAAI, 2024).","DOI":"10.1609\/aaai.v38i17.29865"},{"key":"961_CR99","first-page":"51719","volume":"36","author":"Z Chen","year":"2024","unstructured":"Chen, Z. et al. Content-based unrestricted adversarial attack. Adv. Neural Inf. Process. Syst. 36, 51719\u201351733 (2024).","journal-title":"Adv. Neural Inf. Process. 
Syst."},{"key":"961_CR100","doi-asserted-by":"publisher","first-page":"18","DOI":"10.1016\/j.neunet.2024.106230","volume":"174","author":"Y Fang","year":"2024","unstructured":"Fang, Y., Yap, P.-T., Lin, W., Zhu, H. & Liu, M. Source-free unsupervised domain adaptation: a survey. Neural Netw. 174, 18 (2024).","journal-title":"Neural Netw."},{"key":"961_CR101","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P., Jia, R., & Liang, P. Know what you don\u2019t know: unanswerable questions for SQuAD. In Proc. 56th Annual Meeting of the Association for Computational Linguistics 784\u2013789 (ACL, 2018).","DOI":"10.18653\/v1\/P18-2124"},{"key":"961_CR102","doi-asserted-by":"crossref","unstructured":"Williams, A., Nangia, N., & Bowman, S. A broad-coverage challenge corpus for sentence understanding through inference. In Proc. 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies 1112\u20131122 (ACL, 2024).","DOI":"10.18653\/v1\/N18-1101"},{"key":"961_CR103","doi-asserted-by":"crossref","unstructured":"Wang, A. et al. GLUE: a multi-task benchmark and analysis platform for natural language understanding. In Proc. 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP 353\u2013355 (ACL, 2018).","DOI":"10.18653\/v1\/W18-5446"},{"key":"961_CR104","doi-asserted-by":"crossref","unstructured":"Deng, J. et al. ImageNet: a large-scale hierarchical image database. In Proc. 2009 IEEE Conference on Computer Vision and Pattern Recognition 248\u2013255 (IEEE, 2009).","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"961_CR105","doi-asserted-by":"publisher","first-page":"335","DOI":"10.1007\/s10579-008-9076-6","volume":"42","author":"C Busso","year":"2008","unstructured":"Busso, C. et al. IEMOCAP: interactive emotional dyadic motion capture database. Lang. Res. Eval. 42, 335\u2013359 (2008).","journal-title":"Lang. Res. Eval."},{"key":"961_CR106","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y. et al. Microsoft COCO: common objects in context. In Proc. 13th European Conference on Computer Vision 740\u2013755 (Springer, 2014).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"961_CR107","first-page":"36479","volume":"35","author":"C Saharia","year":"2022","unstructured":"Saharia, C. et al. Photorealistic text-to-image diffusion models with deep language understanding. Adv. Neural Inf. Process. Syst. 35, 36479\u201336494 (2022).","journal-title":"Adv. Neural Inf. Process. Syst."},{"key":"961_CR108","unstructured":"Radford, A. et al. Learning transferable visual models from natural language supervision. In Proc. 38th International Conference on Machine Learning 8748\u20138763 (PMLR, 2021)."},{"key":"961_CR109","doi-asserted-by":"crossref","unstructured":"Kirillov, A. et al. Segment anything. In Proc. IEEE\/CVF International Conference on Computer Vision (ICCV 2023) 4015\u20134026 (IEEE, 2023).","DOI":"10.1109\/ICCV51070.2023.00371"},{"key":"961_CR110","doi-asserted-by":"publisher","unstructured":"Gemini Team, Google. Gemini: a family of highly capable multimodal models. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2312.11805 (2023).","DOI":"10.48550\/arXiv.2312.11805"},{"key":"961_CR111","doi-asserted-by":"publisher","unstructured":"OpenAI. GPT-4o System Card. Preprint at https:\/\/doi.org\/10.48550\/arXiv.2410.21276 (2024).","DOI":"10.48550\/arXiv.2410.21276"},{"key":"961_CR112","doi-asserted-by":"publisher","unstructured":"Touvron, H. et al. Llama: open and efficient foundation language models. 
Preprint at https:\/\/doi.org\/10.48550\/arXiv.2302.13971 (2023).","DOI":"10.48550\/arXiv.2302.13971"}],"container-title":["Nature Machine Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.nature.com\/articles\/s42256-024-00961-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-024-00961-0","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/www.nature.com\/articles\/s42256-024-00961-0.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,1,26]],"date-time":"2025-01-26T23:04:03Z","timestamp":1737932643000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.nature.com\/articles\/s42256-024-00961-0"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2025,1,16]]},"references-count":112,"journal-issue":{"issue":"1","published-online":{"date-parts":[[2025,1]]}},"alternative-id":["961"],"URL":"https:\/\/doi.org\/10.1038\/s42256-024-00961-0","relation":{},"ISSN":["2522-5839"],"issn-type":[{"value":"2522-5839","type":"electronic"}],"subject":[],"published":{"date-parts":[[2025,1,16]]},"assertion":[{"value":"19 October 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"20 November 2024","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"16 January 2025","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"The authors declare no competing interests.","order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Competing interests"}}]}}