{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T05:00:45Z","timestamp":1775278845926,"version":"3.50.1"},"reference-count":29,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"5","license":[{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T00:00:00Z","timestamp":1777593600000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"Israeli Council of Higher Education, AFOSR","award":["FA8655-24-1-7006"],"award-info":[{"award-number":["FA8655-24-1-7006"]}]},{"DOI":"10.13039\/501100000324","name":"Gatsby Charitable Foundation","doi-asserted-by":"publisher","id":[{"id":"10.13039\/501100000324","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2026,5]]},"DOI":"10.1109\/tpami.2025.3647862","type":"journal-article","created":{"date-parts":[[2025,12,24]],"date-time":"2025-12-24T18:45:42Z","timestamp":1766601942000},"page":"5004-5015","source":"Crossref","is-referenced-by-count":0,"title":["Forget Me Not: Fighting Local Overfitting With Knowledge Fusion and Distillation"],"prefix":"10.1109","volume":"48","author":[{"ORCID":"https:\/\/orcid.org\/0009-0003-5083-7001","authenticated-orcid":false,"given":"Uri","family":"Stern","sequence":"first","affiliation":[{"name":"School of Computer Science, Engineering, The Hebrew University of Jerusalem, Jerusalem, Israel"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0009-0006-9766-8892","authenticated-orcid":false,"given":"Eli","family":"Corn","sequence":"additional","affiliation":[{"name":"School of Computer Science, Engineering, The Hebrew University of Jerusalem, Jerusalem, Israel"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8893-8586","authenticated-orcid":false,"given":"Daphna","family":"Weinshall","sequence":"additional","affiliation":[{"name":"School of Computer Science, Engineering, The Hebrew University of Jerusalem, Jerusalem, Israel"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR52688.2022.01167"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1007\/s10489-021-02199-4"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v39i19.34269"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1016\/s0079-7421(08)60536-8"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1007\/s10462-022-10283-5"},{"issue":"1","key":"ref6","first-page":"1929","article-title":"DropOut: A simple way to prevent neural networks from overfitting","volume":"15","author":"Srivastava","year":"2014","journal-title":"J. Mach. Learn. Res."},{"key":"ref7","article-title":"Horizontal and vertical ensemble with deep representation for classification","author":"Xie","year":"2013"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1137\/0330046"},{"key":"ref9","article-title":"Averaging weights leads to wider optima and better generalization","author":"Izmailov","year":"2018"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ACCESS.2022.3180844"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.3390\/app13052935"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1073\/pnas.1903070116"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1088\/1742-5468\/ac3a74"},{"key":"ref14","article-title":"When and how epochwise double descent happens","author":"Stephenson","year":"2021"},{"key":"ref15","article-title":"Early stopping in deep networks: Double descent and how to eliminate it","author":"Heckel","year":"2020"},{"key":"ref16","article-title":"Distilling the knowledge in a neural network","author":"Hinton","year":"2015"},{"key":"ref17","article-title":"Rethinking Self-Distillation: Label Averaging and Enhanced Soft Label Refinement with Partial Labels","volume-title":"Proc. 13th Int. Conf. Learn. Representations (ICLR)","author":"Jeong","year":"2025"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v38i13.29429"},{"key":"ref19","article-title":"Towards understanding ensemble, knowledge distillation and self-distillation in deep learning","author":"Allen-Zhu","year":"2020"},{"key":"ref20","article-title":"Revisiting self-distillation","author":"Pham","year":"2022"},{"key":"ref21","first-page":"7102","article-title":"Understanding self-distillation in the presence of label noise","volume-title":"Proc. Int. Conf. Mach. Learn.","author":"Das","year":"2023"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.52202\/068431-0044"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICPR56361.2022.9956511"},{"key":"ref24","article-title":"Snapshot ensembles: Train 1, get m for free","author":"Huang","year":"2017"},{"key":"ref25","first-page":"19365","article-title":"Self-adaptive training: Beyond empirical risk minimization","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Huang","year":"2020"},{"key":"ref26","first-page":"8803","article-title":"Loss surfaces, mode connectivity, and fast ensembling of DNNs","volume-title":"Proc. Adv. Neural Inf. Process. Syst.","author":"Garipov","year":"2018"},{"key":"ref27","article-title":"An image is worth 16 \u00d7 16 words: Transformers for image recognition at scale","author":"Dosovitskiy","year":"2020"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-031-20053-3_27"},{"issue":"155","key":"ref29","first-page":"1","article-title":"Principal components bias in over-parameterized linear models, and its manifestation in deep neural networks","volume":"23","author":"Hacohen","year":"2022","journal-title":"J. Mach. Learn. Res."}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx8\/34\/11474534\/11314536.pdf?arnumber=11314536","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,4]],"date-time":"2026-04-04T04:16:16Z","timestamp":1775276176000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/11314536\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2026,5]]},"references-count":29,"journal-issue":{"issue":"5"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2025.3647862","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2026,5]]}}}