{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,8]],"date-time":"2026-04-08T16:47:57Z","timestamp":1775666877782,"version":"3.50.1"},"reference-count":91,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/creativecommons.org\/licenses\/by\/4.0\/legalcode"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["11901338"],"award-info":[{"award-number":["11901338"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["31970972"],"award-info":[{"award-number":["31970972"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Tsinghua University Initiative Scientific Research Program"},{"name":"Institute for interdisciplinary Information Core Technology"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/tpami.2021.3067100","type":"journal-article","created":{"date-parts":[[2021,3,18]],"date-time":"2021-03-18T19:57:43Z","timestamp":1616097463000},"page":"1-1","source":"Crossref","is-referenced-by-count":129,"title":["Self-Distillation: Towards Efficient and Compact Neural Networks"],"prefix":"10.1109","author":[{"given":"Linfeng","family":"Zhang","sequence":"first","affiliation":[]},{"given":"Chenglong","family":"Bao","sequence":"additional","affiliation":[]},{"given":"Kaisheng","family":"Ma","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1145\/3065386"},{"key":"ref2","first-page":"770","article-title":"Very deep convolutional networks for large-scale image recognition,","author":"He","journal-title":"Proc. IEEE Conf. Comput. Vis. Pattern Recognit"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.634"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.91"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.l007\/978-3-319-46448-0_2"},{"key":"ref7","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Ren"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2858826"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.322"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-24574-4_28"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v31i1.10510"},{"key":"ref12","first-page":"1","article-title":"Deep compression: Compressing deep neural network with pruning, trained quantization and huffman coding","volume-title":"Proc. 4th Int. Conf. Learn. Representations","author":"Han"},{"key":"ref13","first-page":"1135","article-title":"Learning both weights and connections for efficient neural network","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Han"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_12"},{"key":"ref15","first-page":"1","article-title":"Rethinking the value of network pruning","volume-title":"Proc. 7th Int. Conf. Learn. Representations","author":"Liu"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00020"},{"key":"ref17","first-page":"3123","article-title":"BinaryConnect: Training deep neural networks with binary weights during propagations","volume-title":"Proc. 28th Int. Conf. Neural Inf. Process. Syst.","author":"Courbariaux"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_32"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00141"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00040"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.48550\/ARXIV.1704.04861"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00474"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00682"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"ref25","article-title":"SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5 MB model size","author":"Iandola","year":"2016"},{"key":"ref26","first-page":"1269","article-title":"Exploiting linear structure within convolutional networks for efficient evaluation","volume-title":"Proc. 27th Int. Conf. Neural Inf. Process. Syst.","author":"Denton"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.5244\/C.28.88"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1145\/1150402.1150464"},{"key":"ref29","first-page":"1","article-title":"Distilling the knowledge in a neural network","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Hinton"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00381"},{"key":"ref31","first-page":"4029","article-title":"SCAN: A scalable neural networks framework towards compact and efficient models","volume-title":"Proc. Annu. Conf. Neural Inf. Process. Syst.","author":"Zhang"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00489"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5866"},{"key":"ref35","first-page":"1","article-title":"FitNets: Hints for thin deep nets","volume-title":"Proc. 3rd Int. Conf. Learn. Representations","author":"Romero"},{"key":"ref36","first-page":"1","article-title":"Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer","volume-title":"Proc. 5th Int. Conf. Learn. Representations","author":"Zagoruyko"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.754"},{"key":"ref38","first-page":"141","article-title":"Graph-based knowledge distillation by multi-head attention network","volume-title":"Proc. 30th Brit. Mach. Vis. Conf.","author":"Lee"},{"key":"ref39","first-page":"775","article-title":"KDGAN: Knowledge distillation with generative adversarial networks","volume-title":"Proc. 32nd Int. Conf. Neural Inf. Process. Syst.","author":"Wang"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/IJCNN48605.2020.9207235"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33013771"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00409"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00145"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00143"},{"key":"ref45","first-page":"3030","article-title":"Learning what and where to transfer","volume-title":"Proc. 36th Int. Conf. Mach. Learn.","author":"Jang"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00756"},{"key":"ref47","first-page":"1607","article-title":"Born again neural networks","volume-title":"Proc. 35th Int. Conf. Mach. Learn.","author":"Furlanello"},{"key":"ref48","article-title":"Label refinery: Improving imagenet classification through label progression","author":"Bagherinezhad","year":"2018"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00271"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.309"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2883743"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00683"},{"key":"ref53","first-page":"1","article-title":"Unifying distillation and privileged information","author":"Lopez-Paz","year":"2016","journal-title":"Proc. 4th Int. Conf. Learn. Representations"},{"key":"ref54","article-title":"Learning using privileged information: Similarity control and knowledge transfer","volume":"16","author":"Vapnik","year":"2015","journal-title":"J. Mach. Learn. Res."},{"key":"ref55","first-page":"1","article-title":"Over-parameterization as a catalyst for better generalization of deep ReLU network","volume":"abs\/1909.13458","author":"Tian","year":"2019","journal-title":"CoRR"},{"key":"ref56","first-page":"598","article-title":"Optimal brain damage","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"LeCun"},{"key":"ref57","first-page":"164","article-title":"Second order derivatives for network pruning: Optimal brain surgeon","volume-title":"Proc. Int. Conf. Neural Inf. Process. Syst.","author":"Hassibi"},{"key":"ref58","first-page":"1","article-title":"Learning sparse neural networks through L_0 regularization","volume-title":"Proc. 6th Int. Conf. Learn. Representations","author":"Louizos"},{"key":"ref59","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00339"},{"key":"ref60","first-page":"1","article-title":"The lottery ticket hypothesis: Finding sparse, trainable neural networks","volume-title":"Proc. 7th Int. Conf. Learn. Representations","author":"Frankle"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP49357.2023.10094626"},{"key":"ref62","first-page":"1","article-title":"Incremental network quantization: Towards lossless cnns with low-precision weights","volume-title":"Proc. 5th Int. Conf. Learn. Representations","author":"Zhou"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00141"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00140"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_48"},{"key":"ref66","first-page":"3301","article-title":"Shallow-deep networks: Understanding and mitigating network overthinking","volume-title":"Proc. 36th Int. Conf. Mach. Learn.","author":"Kaya"},{"key":"ref67","first-page":"1520","article-title":"Learning to stop while learning to predict","volume-title":"Proc. 37th Int. Conf. Mach. Learn.","author":"Chen"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01261-8_25"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00919"},{"key":"ref70","first-page":"1","article-title":"Multi-scale dense networks for resource efficient image classification","volume-title":"Proc. 6th Int. Conf. Learn. Representations","author":"Huang"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.243"},{"key":"ref72","first-page":"1","article-title":"Slimmable neural networks","volume-title":"Proc. 7th Int. Conf. Learn. Representations","author":"Yu"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00189"},{"key":"ref74","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"ref75","first-page":"1","article-title":"Unifying distillation and privileged information","volume-title":"Proc. 4th Int. Conf. Learn. Representations","author":"Lopez-Paz"},{"key":"ref76","first-page":"1","volume-title":"Learning Multiple Layers of Features From Tiny Images","author":"Krizhevsky","year":"2009"},{"key":"ref77","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.5244\/C.30.87"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00745"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPRW56347.2022.00309"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298801"},{"key":"ref82","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00936"},{"key":"ref83","article-title":"Autoaugment: Learning augmentation policies from data","author":"Cubuk","year":"2018"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00454"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00201"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.194"},{"key":"ref87","first-page":"147","article-title":"BAM: Bottleneck attention module","volume-title":"Proc. British Mach. Vis. Conf.","author":"Park"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01234-2_1"},{"key":"ref89","first-page":"1","article-title":"On large-batch training for deep learning: Generalization gap and sharp minima","volume-title":"Proc. 5th Int. Conf. Learn. Representations","author":"Keskar"},{"key":"ref90","doi-asserted-by":"publisher","DOI":"10.1007\/s10144-009-0162-4"},{"key":"ref91","first-page":"562","article-title":"Deeply-supervised nets","volume-title":"Proc. 18th Int. Conf. Artif. Intell. Statist.","author":"Lee"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/4359286\/09381661.pdf?arnumber=9381661","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,10]],"date-time":"2024-01-10T00:01:09Z","timestamp":1704844869000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9381661\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":91,"URL":"https:\/\/doi.org\/10.1109\/tpami.2021.3067100","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}