{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,17]],"date-time":"2026-04-17T21:49:09Z","timestamp":1776462549348,"version":"3.51.2"},"reference-count":350,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2021,3,22]],"date-time":"2021-03-22T00:00:00Z","timestamp":1616371200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,3,22]],"date-time":"2021-03-22T00:00:00Z","timestamp":1616371200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2021,6]]},"DOI":"10.1007\/s11263-021-01453-z","type":"journal-article","created":{"date-parts":[[2021,3,22]],"date-time":"2021-03-22T17:03:31Z","timestamp":1616432611000},"page":"1789-1819","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":3046,"title":["Knowledge Distillation: A Survey"],"prefix":"10.1007","volume":"129","author":[{"given":"Jianping","family":"Gou","sequence":"first","affiliation":[]},{"given":"Baosheng","family":"Yu","sequence":"additional","affiliation":[]},{"given":"Stephen J.","family":"Maybank","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7225-5449","authenticated-orcid":false,"given":"Dacheng","family":"Tao","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2021,3,22]]},"reference":[{"key":"1453_CR1","doi-asserted-by":"crossref","unstructured":"Aditya, S., Saha, R., Yang, Y., & Baral, C. (2019). Spatial knowledge distillation to aid visual reasoning. In WACV.","DOI":"10.1109\/WACV.2019.00030"},{"key":"1453_CR2","doi-asserted-by":"crossref","unstructured":"Aguilar, G., Ling, Y., Zhang, Y., Yao, B., Fan, X., & Guo, E. (2020). Knowledge distillation from internal representations. In AAAI.","DOI":"10.1609\/aaai.v34i05.6229"},{"key":"1453_CR3","unstructured":"Aguinaldo, A., Chiang, P. Y., Gain, A., Patil, A., Pearson, K., & Feizi, S. (2019). Compressing gans using knowledge distillation. arXiv preprint arXiv:1902.00159."},{"key":"1453_CR4","doi-asserted-by":"crossref","unstructured":"Ahn, S., Hu, S., Damianou, A., Lawrence, N. D., & Dai, Z. (2019). Variational information distillation for knowledge transfer. In CVPR.","DOI":"10.1109\/CVPR.2019.00938"},{"key":"1453_CR5","doi-asserted-by":"crossref","unstructured":"Albanie, S., Nagrani, A., Vedaldi, A., & Zisserman, A. (2018). Emotion recognition in speech using cross-modal transfer in the wild. In ACM MM.","DOI":"10.1145\/3240508.3240578"},{"key":"1453_CR6","unstructured":"Allen-Zhu, Z., Li, Y., & Liang, Y. (2019). Learning and generalization in overparameterized neural networks, going beyond two layers. In NeurIPS."},{"key":"1453_CR7","unstructured":"Anil, R., Pereyra, G., Passos, A., Ormandi, R., Dahl, G. E., & Hinton, G. E. (2018). Large scale distributed neural network training through online distillation. In ICLR."},{"key":"1453_CR8","unstructured":"Arora, S., Cohen, N., & Hazan, E. (2018). On the optimization of deep networks: Implicit acceleration by overparameterization. In ICML."},{"key":"1453_CR9","doi-asserted-by":"crossref","unstructured":"Arora, S., Khapra, M. M., & Ramaswamy, H. G. (2019). On knowledge distillation from complex networks for response prediction. In NAACL-HLT.","DOI":"10.18653\/v1\/N19-1382"},{"key":"1453_CR10","doi-asserted-by":"crossref","unstructured":"Asami, T., Masumura, R., Yamaguchi, Y., Masataki, H., & Aono, Y. (2017). Domain adaptation of dnn acoustic models using knowledge distillation. In ICASSP.","DOI":"10.1109\/ICASSP.2017.7953145"},{"key":"1453_CR11","unstructured":"Ashok, A., Rhinehart, N., Beainy, F., & Kitani, K. M. (2018). N2N learning: Network to network compression via policy gradient reinforcement learning. In ICLR."},{"key":"1453_CR12","unstructured":"Asif, U., Tang, J. & Harrer, S. (2020). Ensemble knowledge distillation for learning improved and efficient networks. In ECAI."},{"key":"1453_CR13","unstructured":"Ba, J., & Caruana, R. (2014). Do deep nets really need to be deep? In NeurIPS."},{"key":"1453_CR14","unstructured":"Bagherinezhad, H., Horton, M., Rastegari, M., & Farhadi, A. (2018). Label refinery: Improving imagenet classification through label progression. arXiv preprint arXiv:1805.02641."},{"key":"1453_CR15","doi-asserted-by":"crossref","unstructured":"Bai, H., Wu, J., King, I., & Lyu, M. (2020). Few shot network compression via cross distillation. In AAAI.","DOI":"10.1609\/aaai.v34i04.5718"},{"key":"1453_CR16","doi-asserted-by":"crossref","unstructured":"Bai, Y., Yi, J., Tao, J., Tian, Z., & Wen, Z. (2019). Learn spelling from teachers: transferring knowledge from language models to sequence-to-sequence speech recognition. In Interspeech.","DOI":"10.21437\/Interspeech.2019-1554"},{"key":"1453_CR17","doi-asserted-by":"crossref","unstructured":"Bashivan, P., Tensen, M., & DiCarlo, J. J. (2019). Teacher guided architecture search. In ICCV.","DOI":"10.1109\/ICCV.2019.00542"},{"key":"1453_CR18","unstructured":"Belagiannis, V., Farshad, A., & Galasso, F. (2018). Adversarial network compression. In ECCV."},{"issue":"8","key":"1453_CR19","doi-asserted-by":"publisher","first-page":"1798","DOI":"10.1109\/TPAMI.2013.50","volume":"35","author":"Y Bengio","year":"2013","unstructured":"Bengio, Y., Courville, A., & Vincent, P. (2013). Representation learning: A review and new perspectives. IEEE TPAMI, 35(8), 1798\u20131828.","journal-title":"IEEE TPAMI"},{"key":"1453_CR20","doi-asserted-by":"crossref","unstructured":"Bergmann, P., Fauser, M., Sattlegger, D., & Steger, C. (2020). Uninformed students: Student-teacher anomaly detection with discriminative latent embeddings. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00424"},{"key":"1453_CR21","doi-asserted-by":"crossref","unstructured":"Bhardwaj, S., Srinivasan, M., & Khapra, M. M. (2019). Efficient video classification using fewer frames. In CVPR.","DOI":"10.1109\/CVPR.2019.00044"},{"key":"1453_CR22","unstructured":"Bistritz, I., Mann, A., & Bambos, N. (2020). Distributed Distillation for On-Device Learning. In NeurIPS."},{"key":"1453_CR23","unstructured":"Bohdal, O., Yang, Y., & Hospedales, T. (2020). Flexible Dataset Distillation: Learn Labels Instead of Images. arXiv preprint arXiv:2006.08572."},{"key":"1453_CR24","doi-asserted-by":"crossref","unstructured":"Boo, Y., Shin, S., Choi, J., & Sung, W. (2021). Stochastic precision ensemble: self-knowledge distillation for quantized deep neural networks. In AAAI.","DOI":"10.1609\/aaai.v35i8.16839"},{"key":"1453_CR25","unstructured":"Brutzkus, A., & Globerson, A. (2019). Why do Larger Models Generalize Better? A Theoretical Perspective via the XOR Problem. In ICML."},{"key":"1453_CR26","doi-asserted-by":"crossref","unstructured":"Bucilua, C., Caruana, R. & Niculescu-Mizil, A. (2006). Model compression. In SIGKDD.","DOI":"10.1145\/1150402.1150464"},{"key":"1453_CR27","unstructured":"Caccia, M., Rodriguez, P., Ostapenko, O., Normandin, F., Lin, M., Caccia, L., Laradji, I., Rish, I., Lacoste, A., Vazquez D., & Charlin, L. (2020). Online Fast Adaptation and Knowledge Accumulation (OSAKA): a New Approach to Continual Learning. In NeurIPS."},{"key":"1453_CR28","unstructured":"Chan, W., Ke, N. R., & Lane, I. (2015). Transferring knowledge from a RNN to a DNN. arXiv preprint arXiv:1504.01483."},{"key":"1453_CR29","doi-asserted-by":"crossref","unstructured":"Chawla, A., Yin, H., Molchanov, P., & Alvarez, J. (2021). Data-Free Knowledge Distillation for Object Detection. In WACV.","DOI":"10.1109\/WACV48630.2021.00333"},{"key":"1453_CR30","doi-asserted-by":"crossref","unstructured":"Chebotar, Y. & Waters, A. (2016). Distilling knowledge from ensembles of neural networks for speech recognition. In Interspeech.","DOI":"10.21437\/Interspeech.2016-1190"},{"key":"1453_CR31","doi-asserted-by":"crossref","unstructured":"Chen, D., Mei, J. P., Wang, C., Feng, Y. & Chen, C. (2020a). Online knowledge distillation with diverse peers. In AAAI.","DOI":"10.1609\/aaai.v34i04.5746"},{"key":"1453_CR32","doi-asserted-by":"crossref","unstructured":"Chen, D., Mei, J. P., Zhang, Y., Wang, C., Wang, Z., Feng, Y., & Chen, C. (2021). Cross-layer distillation with semantic calibration. In AAAI.","DOI":"10.1609\/aaai.v35i8.16865"},{"key":"1453_CR33","unstructured":"Chen, G., Choi, W., Yu, X., Han, T., & Chandraker, M. (2017). Learning efficient object detection models with knowledge distillation. In NeurIPS."},{"key":"1453_CR34","doi-asserted-by":"crossref","unstructured":"Chen, H., Wang, Y., Xu, C., Yang, Z., Liu, C., Shi, B., Xu, C., Xu, C.,&Tian, Q. (2019a). Data-free learning of student networks. In ICCV.","DOI":"10.1109\/ICCV.2019.00361"},{"issue":"1","key":"1453_CR35","first-page":"25","volume":"32","author":"H Chen","year":"2021","unstructured":"Chen, H., Wang, Y., Xu, C., Xu, C., & Tao, D. (2021). Learning student networks via feature embedding. IEEE TNNLS, 32(1), 25\u201335.","journal-title":"IEEE TNNLS"},{"key":"1453_CR36","unstructured":"Chen, T., Goodfellow, I. & Shlens, J. (2016). Net2net: Accelerating learning via knowledge transfer. In ICLR."},{"key":"1453_CR37","doi-asserted-by":"crossref","unstructured":"Chen, W. C., Chang, C. C. & Lee, C. R. (2018a). Knowledge distillation with feature maps for image classification. In ACCV.","DOI":"10.1007\/978-3-030-20893-6_13"},{"issue":"1","key":"1453_CR38","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3281659","volume":"37","author":"X Chen","year":"2018","unstructured":"Chen, X., Zhang, Y., Xu, H., Qin, Z., & Zha, H. (2018b). Adversarial distillation for efficient recommendation with external knowledge. ACM TOIS, 37(1), 1\u201328.","journal-title":"ACM TOIS"},{"key":"1453_CR39","doi-asserted-by":"crossref","unstructured":"Chen, X., Su, J., & Zhang, J. (2019b). A two-teacher tramework for knowledge distillation. In ISNN.","DOI":"10.1007\/978-3-030-22796-8_7"},{"key":"1453_CR40","doi-asserted-by":"crossref","unstructured":"Chen, Y., Wang, N., & Zhang, Z. (2018c). Darkrank: Accelerating deep metric learning via cross sample similarities transfer. In AAAI.","DOI":"10.1609\/aaai.v32i1.11783"},{"key":"1453_CR41","doi-asserted-by":"crossref","unstructured":"Chen, Y. C., Gan, Z., Cheng, Y., Liu, J., & Liu, J. (2020b). Distilling knowledge learned in BERT for text generation. In ACL.","DOI":"10.18653\/v1\/2020.acl-main.705"},{"key":"1453_CR42","doi-asserted-by":"crossref","unstructured":"Chen, Y. C., Lin, Y. Y., Yang, M. H., Huang, J. B. (2019c). Crdoco: Pixel-level domain transfer with cross-domain consistency. In CVPR.","DOI":"10.1109\/CVPR.2019.00189"},{"issue":"3","key":"1453_CR43","doi-asserted-by":"publisher","first-page":"1","DOI":"10.2200\/S00832ED1V01Y201802AIM037","volume":"12","author":"Z Chen","year":"2018","unstructured":"Chen, Z., & Liu, B. (2018). Lifelong machine learning. Synthesis Lectures on Artificial Intelligence and Machine Learning, 12(3), 1\u2013207.","journal-title":"Synthesis Lectures on Artificial Intelligence and Machine Learning"},{"key":"1453_CR44","doi-asserted-by":"crossref","unstructured":"Chen, Z., Zhu, L., Wan, L., Wang, S., Feng, W., & Heng, P. A. (2020c). A multi-task mean teacher for semi-supervised shadow detection. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00565"},{"issue":"1","key":"1453_CR45","doi-asserted-by":"publisher","first-page":"126","DOI":"10.1109\/MSP.2017.2765695","volume":"35","author":"Y Cheng","year":"2018","unstructured":"Cheng, Y., Wang, D., Zhou, P., & Zhang, T. (2018). Model compression and acceleration for deep neural networks: The principles, progress, and challenges. IEEE Signal Processing Magazine, 35(1), 126\u2013136.","journal-title":"IEEE Signal Processing Magazine"},{"key":"1453_CR46","doi-asserted-by":"crossref","unstructured":"Cheng, X., Rao, Z., Chen, Y., & Zhang, Q. (2020). Explaining knowledge distillation by quantifying the knowledge. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01294"},{"key":"1453_CR47","doi-asserted-by":"crossref","unstructured":"Cho, J. H. & Hariharan, B. (2019). On the efficacy of knowledge distillation. In ICCV.","DOI":"10.1109\/ICCV.2019.00489"},{"key":"1453_CR48","doi-asserted-by":"crossref","unstructured":"Chollet, F. (2017). Xception: Deep learning with depthwise separable convolutions. In CVPR.","DOI":"10.1109\/CVPR.2017.195"},{"key":"1453_CR49","unstructured":"Chung, I., Park, S., Kim, J. & Kwak, N. (2020). Feature-map-level online adversarial knowledge distillation. In ICML."},{"key":"1453_CR50","doi-asserted-by":"crossref","unstructured":"Clark, K., Luong, M. T., Khandelwal, U., Manning, C. D. & Le, Q. V. (2019). Bam! born-again multi-task networks for natural language understanding. In ACL.","DOI":"10.18653\/v1\/P19-1595"},{"key":"1453_CR51","unstructured":"Courbariaux, M., Bengio, Y. & David, J. P. (2015). Binaryconnect: Training deep neural networks with binary weights during propagations. In NeurIPS."},{"key":"1453_CR52","unstructured":"Crowley, E. J., Gray, G. & Storkey, A. J. (2018). Moonshine: Distilling with cheap convolutions. In NeurIPS."},{"key":"1453_CR53","doi-asserted-by":"crossref","unstructured":"Cui, J., Kingsbury, B., Ramabhadran, B., Saon, G., Sercu, T., Audhkhasi, K., et\u00a0al. (2017). Knowledge distillation across ensembles of multilingual models for low-resource languages. In ICASSP.","DOI":"10.1109\/ICASSP.2017.7953073"},{"key":"1453_CR54","unstructured":"Cui, Z., Song, T., Wang, Y., & Ji, Q. (2020). Knowledge augmented deep neural networks for joint facial expression and action unit recognition. In NeurIPS."},{"key":"1453_CR55","doi-asserted-by":"crossref","unstructured":"Cun, X., & Pun, C. M. (2020). Defocus blur detection via depth distillation. In ECCV.","DOI":"10.1007\/978-3-030-58601-0_44"},{"key":"1453_CR56","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L. J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In CVPR.","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"1453_CR57","unstructured":"Denton, E. L., Zaremba, W., Bruna, J., LeCun, Y. & Fergus, R. (2014). Exploiting linear structure within convolutional networks for efficient evaluation. In NeurIPS."},{"key":"1453_CR58","unstructured":"Devlin, J., Chang, M. W., Lee, K. & Toutanova, K. (2019). Bert: Pre-training of deep bidirectional transformers for language understanding. In NAACL-HLT ."},{"key":"1453_CR59","unstructured":"Ding, Q., Wu, S., Sun, H., Guo, J. & Xia, S. T. (2019). Adaptive regularization of labels. arXiv preprint arXiv:1908.05474."},{"key":"1453_CR60","doi-asserted-by":"crossref","unstructured":"Do, T., Do, T. T., Tran, H., Tjiputra, E. & Tran, Q. D. (2019). Compact trilinear interaction for visual question answering. In ICCV.","DOI":"10.1109\/ICCV.2019.00048"},{"key":"1453_CR61","doi-asserted-by":"crossref","unstructured":"Dong, X. & Yang, Y. (2019). Teacher supervises students how to learn from partially labeled images for facial landmark detection. In ICCV.","DOI":"10.1109\/ICCV.2019.00087"},{"issue":"7","key":"1453_CR62","first-page":"2415","volume":"39","author":"Q Dou","year":"2020","unstructured":"Dou, Q., Liu, Q., Heng, P. A., & Glocker, B. (2020). Unpaired multi-modal segmentation via knowledge distillation. IEEE TMI, 39(7), 2415\u20132425.","journal-title":"IEEE TMI"},{"key":"1453_CR63","unstructured":"Du, S., You, S., Li, X., Wu, J., Wang, F., Qian, C., & Zhang, C. (2020). Agree to disagree: Adaptive ensemble knowledge distillation in gradient space. In NeurIPS."},{"key":"1453_CR64","unstructured":"Duong, C. N., Luu, K., Quach, K. G. & Le, N. (2019.) ShrinkTeaNet: Million-scale lightweight face recognition via shrinking teacher-student networks. arXiv preprint arXiv:1905.10620."},{"key":"1453_CR65","unstructured":"Fakoor, R., Mueller, J. W., Erickson, N., Chaudhari, P., & Smola, A. J. (2020). Fast, Accurate, and Simple Models for Tabular Data via Augmented Distillation. In NeurIPS."},{"key":"1453_CR66","unstructured":"Flennerhag, S., Moreno, P. G., Lawrence, N. D. & Damianou, A. (2019). Transferring knowledge across learning processes. In ICLR."},{"key":"1453_CR67","unstructured":"Freitag, M., Al-Onaizan, Y. & Sankaran, B. (2017). Ensemble distillation for neural machine translation. arXiv preprint arXiv:1702.01802."},{"key":"1453_CR68","doi-asserted-by":"crossref","unstructured":"Fu, H., Zhou, S., Yang, Q., Tang, J., Liu, G., Liu, K., & Li, X. (2021). LRC-BERT: Latent-representation Contrastive Knowledge Distillation for Natural Language Understanding. In AAAI.","DOI":"10.1609\/aaai.v35i14.17518"},{"key":"1453_CR69","doi-asserted-by":"crossref","unstructured":"Fukuda, T., Suzuki, M., Kurata, G., Thomas, S., Cui, J. & Ramabhadran, B. (2017). Efficient knowledge distillation from an ensemble of teachers. In Interspeech.","DOI":"10.21437\/Interspeech.2017-614"},{"key":"1453_CR70","unstructured":"Furlanello, T., Lipton, Z., Tschannen, M., Itti, L. & Anandkumar, A. (2018). Born again neural networks. In ICML."},{"key":"1453_CR71","doi-asserted-by":"publisher","first-page":"105319","DOI":"10.1109\/ACCESS.2019.2931656","volume":"7","author":"L Gao","year":"2019","unstructured":"Gao, L., Mi, H., Zhu, B., Feng, D., Li, Y., & Peng, Y. (2019). An adversarial feature distillation method for audio classification. IEEE Access, 7, 105319\u2013105330.","journal-title":"IEEE Access"},{"key":"1453_CR72","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1016\/j.neucom.2020.10.113","volume":"433","author":"M Gao","year":"2021","unstructured":"Gao, M., Wang, Y., & Wan, L. (2021). Residual error based knowledge distillation. Neurocomputing, 433, 154\u2013161.","journal-title":"Neurocomputing"},{"issue":"5","key":"1453_CR73","first-page":"1524","volume":"39","author":"Z Gao","year":"2020","unstructured":"Gao, Z., Chung, J., Abdelrazek, M., Leung, S., Hau, W. K., Xian, Z., et al. (2020). Privileged modality distillation for vessel border detection in intracoronary imaging. IEEE TMI, 39(5), 1524\u20131534.","journal-title":"IEEE TMI"},{"key":"1453_CR74","doi-asserted-by":"crossref","unstructured":"Garcia, N. C., Morerio, P. & Murino, V. (2018). Modality distillation with multiple stream networks for action recognition. In ECCV.","DOI":"10.1007\/978-3-030-01237-3_7"},{"issue":"4","key":"1453_CR75","first-page":"2051","volume":"28","author":"S Ge","year":"2018","unstructured":"Ge, S., Zhao, S., Li, C., & Li, J. (2018). Low-resolution face recognition in the wild via selective knowledge distillation. IEEE TIP, 28(4), 2051\u20132062.","journal-title":"IEEE TIP"},{"key":"1453_CR76","first-page":"6898","volume":"29","author":"S Ge","year":"2020","unstructured":"Ge, S., Zhao, S., Li, C., Zhang, Y., & Li, J. (2020). Efficient low-resolution face recognition via bridge distillation. IEEE TIP, 29, 6898\u20136908.","journal-title":"IEEE TIP"},{"key":"1453_CR77","doi-asserted-by":"crossref","unstructured":"Ghorbani, S., Bulut, A. E. & Hansen, J. H. (2018). Advancing multi-accented lstm-ctc speech recognition using a domain specific student-teacher learning paradigm. In SLTW.","DOI":"10.1109\/SLT.2018.8639566"},{"key":"1453_CR78","doi-asserted-by":"crossref","unstructured":"Gil, Y., Chai, Y., Gorodissky, O. & Berant, J. (2019). White-to-black: Efficient distillation of black-box adversarial attacks. In NAACL-HLT.","DOI":"10.18653\/v1\/N19-1139"},{"key":"1453_CR79","doi-asserted-by":"crossref","unstructured":"Goldblum, M., Fowl, L., Feizi, S. & Goldstein, T. (2020). Adversarially robust distillation. In AAAI.","DOI":"10.1609\/aaai.v34i04.5816"},{"key":"1453_CR80","doi-asserted-by":"crossref","unstructured":"Gong, C., Chang, X., Fang, M. & Yang, J. (2018). Teaching semi-supervised classifier via generalized distillation. In IJCAI.","DOI":"10.24963\/ijcai.2018\/298"},{"issue":"6","key":"1453_CR81","first-page":"1452","volume":"28","author":"C Gong","year":"2017","unstructured":"Gong, C., Tao, D., Liu, W., Liu, L., & Yang, J. (2017). Label propagation via teaching-to-learn and learning-to-teach. TNNLS, 28(6), 1452\u20131465.","journal-title":"TNNLS"},{"key":"1453_CR82","unstructured":"Goodfellow, I., Pouget-Abadie, J., Mirza, M., Xu, B., Warde-Farley, D., Ozair, S., Courville, A., & Bengio, Y. (2014). Generative adversarial nets. In NeurIPS."},{"key":"1453_CR83","unstructured":"Gordon, M. A. & Duh, K. (2019). Explaining sequence-level knowledge distillation as data-augmentation for neural machine translation. arXiv preprint arXiv:1912.03334."},{"key":"1453_CR84","unstructured":"Gu, J., & Tresp, V. (2020). Search for better students to learn distilled knowledge. In ECAI."},{"key":"1453_CR85","doi-asserted-by":"crossref","unstructured":"Guan, Y., Zhao, P., Wang, B., Zhang, Y., Yao, C., Bian, K., & Tang, J. (2020). Differentiable feature aggregation search for knowledge distillation. In ECCV.","DOI":"10.1007\/978-3-030-58520-4_28"},{"key":"1453_CR86","doi-asserted-by":"crossref","unstructured":"Guo, Q., Wang, X., Wu, Y., Yu, Z., Liang, D., Hu, X., & Luo, P. (2020). Online knowledge distillation via collaborative learning. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01103"},{"key":"1453_CR87","doi-asserted-by":"crossref","unstructured":"Gupta, S., Hoffman, J. & Malik, J. (2016). Cross modal distillation for supervision transfer. In CVPR.","DOI":"10.1109\/CVPR.2016.309"},{"key":"1453_CR88","doi-asserted-by":"crossref","unstructured":"Hahn, S. & Choi, H. (2019). Self-knowledge distillation in natural language processing. In RANLP.","DOI":"10.26615\/978-954-452-056-4_050"},{"key":"1453_CR89","doi-asserted-by":"crossref","unstructured":"Haidar, M. A. & Rezagholizadeh, M. (2019). Textkd-gan: Text generation using knowledge distillation and generative adversarial networks. In Canadian conference on artificial intelligence.","DOI":"10.1007\/978-3-030-18305-9_9"},{"key":"1453_CR90","unstructured":"Han, S., Pool, J., Tran, J. & Dally, W. (2015). Learning both weights and connections for efficient neural network. In NeurIPS."},{"key":"1453_CR91","doi-asserted-by":"publisher","first-page":"13","DOI":"10.1016\/j.patcog.2019.03.005","volume":"92","author":"W Hao","year":"2019","unstructured":"Hao, W., & Zhang, Z. (2019). Spatiotemporal distilled dense-connectivity network for video action recognition. Pattern Recognition, 92, 13\u201324.","journal-title":"Pattern Recognition"},{"key":"1453_CR92","doi-asserted-by":"crossref","unstructured":"Haroush, M., Hubara, I., Hoffer, E., & Soudry, D. (2020). The knowledge within: Methods for data-free model compression. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00852"},{"key":"1453_CR93","unstructured":"He, C., Annavaram, M., & Avestimehr, S. (2020a). Group knowledge transfer: Federated learning of large CNNs at the edge. In NeurIPS."},{"issue":"12","key":"1453_CR94","first-page":"5349","volume":"31","author":"F He","year":"2020","unstructured":"He, F., Liu, T., & Tao, D. (2020b). Why resnet works? residuals generalize. IEEE TNNLS, 31(12), 5349\u20135362.","journal-title":"IEEE TNNLS"},{"key":"1453_CR95","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S. & Sun, J. (2016) Deep residual learning for image recognition. In CVPR.","DOI":"10.1109\/CVPR.2016.90"},{"key":"1453_CR96","doi-asserted-by":"crossref","unstructured":"He, T., Shen, C., Tian, Z., Gong, D., Sun, C. & Yan, Y. (2019). Knowledge adaptation for efficient semantic segmentation. In CVPR.","DOI":"10.1109\/CVPR.2019.00067"},{"key":"1453_CR97","doi-asserted-by":"crossref","unstructured":"Heo, B., Kim, J., Yun, S., Park, H., Kwak, N., & Choi, J. Y. (2019a). A comprehensive overhaul of feature distillation. In ICCV.","DOI":"10.1109\/ICCV.2019.00201"},{"key":"1453_CR98","doi-asserted-by":"crossref","unstructured":"Heo, B., Lee, M., Yun, S. & Choi, J. Y. (2019b). Knowledge distillation with adversarial samples supporting decision boundary. In AAAI.","DOI":"10.1609\/aaai.v33i01.33013771"},{"key":"1453_CR99","doi-asserted-by":"crossref","unstructured":"Heo, B., Lee, M., Yun, S. & Choi, J. Y. (2019c). Knowledge transfer via distillation of activation boundaries formed by hidden neurons. In AAAI.","DOI":"10.1609\/aaai.v33i01.33013779"},{"key":"1453_CR100","unstructured":"Hinton, G., Vinyals, O. & Dean, J. (2015). Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531."},{"key":"1453_CR101","doi-asserted-by":"crossref","unstructured":"Hoffman, J., Gupta, S. & Darrell, T. (2016). Learning with side information through modality hallucination. In CVPR.","DOI":"10.1109\/CVPR.2016.96"},{"key":"1453_CR102","unstructured":"Hong, W. & Yu, J. (2019). Gan-knowledge distillation for one-stage object detection. arXiv preprint arXiv:1906.08467."},{"key":"1453_CR103","doi-asserted-by":"crossref","unstructured":"Hou, Y., Ma, Z., Liu, C. & Loy, CC. (2019). Learning lightweight lane detection cnns by self attention distillation. In ICCV.","DOI":"10.1109\/ICCV.2019.00110"},{"key":"1453_CR104","doi-asserted-by":"crossref","unstructured":"Hou, Y., Ma, Z., Liu, C., Hui, T. W., & Loy, C. C. (2020). Inter-Region Affinity Distillation for Road Marking Segmentation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01250"},{"key":"1453_CR105","unstructured":"Howard, A. G., Zhu, M., Chen, B., Kalenichenko, D., Wang, W., Weyand, T., Andreetto, M., & Adam, H. (2017). Mobilenets: Efficient convolutional neural networks for mobile vision applications. arXiv preprint arXiv:1704.04861."},{"key":"1453_CR106","doi-asserted-by":"crossref","unstructured":"Hu, H., Xie, L., Hong, R., & Tian, Q. (2020). Creating something from nothing: Unsupervised knowledge distillation for cross-modal hashing. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00319"},{"key":"1453_CR107","doi-asserted-by":"crossref","unstructured":"Hu, M., Peng, Y., Wei, F., Huang, Z., Li, D., Yang, N., et\u00a0al. (2018). Attention-guided answer distillation for machine reading comprehension. In EMNLP.","DOI":"10.18653\/v1\/D18-1232"},{"key":"1453_CR108","doi-asserted-by":"crossref","unstructured":"Huang, G., Liu, Z., Van, Der\u00a0Maaten, L. & Weinberger, K. Q. (2017). Densely connected convolutional networks. In CVPR.","DOI":"10.1109\/CVPR.2017.243"},{"key":"1453_CR109","doi-asserted-by":"crossref","unstructured":"Huang, M., You, Y., Chen, Z., Qian, Y. & Yu, K. (2018). Knowledge distillation for sequence model. In Interspeech.","DOI":"10.21437\/Interspeech.2018-1589"},{"key":"1453_CR110","unstructured":"Huang, Z. & Wang, N. (2017). Like what you like: Knowledge distill via neuron selectivity transfer. arXiv preprint arXiv:1707.01219."},{"key":"1453_CR111","unstructured":"Huang, Z., Zou, Y., Bhagavatula, V., & Huang, D. (2020). Comprehensive attention self-distillation for weakly-supervised object detection. In NeurIPS."},{"key":"1453_CR112","unstructured":"Ioffe, S., & Szegedy, C. (2015). Batch normalization: Accelerating deep network training by reducing internal covariate shift. In ICML"},{"key":"1453_CR113","unstructured":"Jang, Y., Lee, H., Hwang, S. J. & Shin, J. (2019). Learning what and where to transfer. In ICML."},{"key":"1453_CR114","unstructured":"Ji, G., & Zhu, Z. (2020). Knowledge distillation in wide neural networks: Risk bound, data efficiency and imperfect teacher. In NeurIPS."},{"key":"1453_CR115","doi-asserted-by":"crossref","unstructured":"Jiao, X., Yin, Y., Shang, L., Jiang, X., Chen, X., Li, L., et\u00a0al. (2020). Tinybert: Distilling bert for natural language understanding. In EMNLP.","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"1453_CR116","doi-asserted-by":"crossref","unstructured":"Jin, X., Peng, B., Wu, Y., Liu, Y., Liu, J., Liang, D., Yan, J., & Hu, X. (2019). Knowledge distillation via route constrained optimization. In ICCV.","DOI":"10.1109\/ICCV.2019.00143"},{"key":"1453_CR117","doi-asserted-by":"crossref","unstructured":"Kang, M., Mun, J. & Han, B. (2020). Towards oracle knowledge distillation with neural architecture search. In AAAI.","DOI":"10.1609\/aaai.v34i04.5866"},{"key":"1453_CR118","unstructured":"Kim, J., Park, S. & Kwak, N. (2018). Paraphrasing complex network: Network compression via factor transfer. In NeurIPS."},{"key":"1453_CR119","unstructured":"Kim, J., Bhalgat, Y., Lee, J., Patel, C., & Kwak, N. (2019a). QKD: Quantization-aware Knowledge Distillation. arXiv preprint arXiv:1911.12491."},{"key":"1453_CR120","unstructured":"Kim, J., Hyun, M., Chung, I. & Kwak, N. (2019b). Feature fusion for online mutual knowledge distillation. In ICPR."},{"key":"1453_CR121","unstructured":"Kim, S. W. & Kim, H. E. (2017). Transferring knowledge to smaller network with class-distance loss. In ICLRW."},{"key":"1453_CR122","doi-asserted-by":"crossref","unstructured":"Kim, Y., Rush & A. M. (2016). Sequence-level knowledge distillation. In EMNLP.","DOI":"10.18653\/v1\/D16-1139"},{"key":"1453_CR123","unstructured":"Kimura, A., Ghahramani, Z., Takeuchi, K., Iwata, T. & Ueda, N. (2018). Few-shot learning of neural networks from scratch by pseudo example optimization. In BMVC."},{"key":"1453_CR124","doi-asserted-by":"crossref","unstructured":"Kwon, K., Na, H., Lee, H., & Kim, N. S. (2020). Adaptive knowledge distillation based on entropy. In ICASSP.","DOI":"10.1109\/ICASSP40776.2020.9054698"},{"key":"1453_CR125","unstructured":"Kong, H., Zhao, J., Tu, X., Xing, J., Shen, S. & Feng, J. (2019). Cross-resolution face recognition via prior-aided face hallucination and residual knowledge distillation. arXiv preprint arXiv:1905.10777."},{"key":"1453_CR126","unstructured":"Krizhevsky, A., & Hinton, G. (2009). Learning multiple layers of features from tiny images."},{"key":"1453_CR127","unstructured":"Krizhevsky, A., Sutskever, I. & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. In NeurIPS."},{"key":"1453_CR128","doi-asserted-by":"crossref","unstructured":"Kuncoro, A., Ballesteros, M., Kong, L., Dyer, C. & Smith, N. A. (2016). Distilling an ensemble of greedy dependency parsers into one mst parser. In EMNLP.","DOI":"10.18653\/v1\/D16-1180"},{"key":"1453_CR129","doi-asserted-by":"crossref","unstructured":"Kundu, J. N., Lakkakula, N. & Babu, R. V. (2019). Um-adapt: Unsupervised multi-task adaptation using adversarial cross-task distillation. In CVPR.","DOI":"10.1109\/ICCV.2019.00152"},{"key":"1453_CR130","doi-asserted-by":"crossref","unstructured":"Lai, K. H., Zha, D., Li, Y., & Hu, X. (2020). Dual policy distillation. In IJCAI.","DOI":"10.24963\/ijcai.2020\/435"},{"key":"1453_CR131","unstructured":"Lan, X., Zhu, X., & Gong, S. (2018). Self-referenced deep learning. In ACCV."},{"key":"1453_CR132","unstructured":"Lee, H., Hwang, S. J. & Shin, J. (2019a). Rethinking data augmentation: Self-supervision and self-distillation. arXiv preprint arXiv:1910.05872."},{"key":"1453_CR133","doi-asserted-by":"crossref","unstructured":"Lee, K., Lee, K., Shin, J. & Lee, H. (2019b). Overcoming catastrophic forgetting with unlabeled data in the wild. In ICCV.","DOI":"10.1109\/ICCV.2019.00040"},{"key":"1453_CR134","unstructured":"Lee, K., Nguyen, L. T. & Shim, B. (2019c). Stochasticity and skip connections improve knowledge transfer. In AAAI."},{"key":"1453_CR135","unstructured":"Lee, S. & Song, B. (2019). Graph-based knowledge distillation by multi-head attention network. In BMVC."},{"key":"1453_CR136","doi-asserted-by":"crossref","unstructured":"Lee, S. H., Kim, D. H. & Song, B. C. (2018). Self-supervised knowledge distillation using singular value decomposition. In ECCV.","DOI":"10.1007\/978-3-030-01231-1_21"},{"key":"1453_CR137","doi-asserted-by":"crossref","unstructured":"Li, B., Wang, Z., Liu, H., Du, Q., Xiao, T., Zhang, C., & Zhu, J. (2021). Learning light-weight translation models from deep transformer. In AAAI.","DOI":"10.1609\/aaai.v35i15.17561"},{"key":"1453_CR138","doi-asserted-by":"crossref","unstructured":"Li, C., Peng, J., Yuan, L., Wang, G., Liang, X., Lin, L., & Chang, X. (2020a). Blockwisely supervised neural architecture search with knowledge distillation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00206"},{"key":"1453_CR139","unstructured":"Li, G., Zhang, J., Wang, Y., Liu, C., Tan, M., Lin, Y., Zhang, W., Feng, J., & Zhang, T. (2020b). Residual distillation: Towards portable deep neural networks without shortcuts. In NeurIPS."},{"key":"1453_CR140","first-page":"1902","volume":"29","author":"J Li","year":"2019","unstructured":"Li, J., Fu, K., Zhao, S., & Ge, S. (2019). Spatiotemporal knowledge distillation for efficient estimation of aerial video saliency. IEEE TIP, 29, 1902\u20131914.","journal-title":"IEEE TIP"},{"key":"1453_CR141","doi-asserted-by":"crossref","unstructured":"Li, M., Lin, J., Ding, Y., Liu, Z., Zhu, J. Y., & Han, S. (2020c). Gan compression: Efficient architectures for interactive conditional gans. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00533"},{"key":"1453_CR142","doi-asserted-by":"crossref","unstructured":"Li, Q., Jin, S. & Yan, J. (2017). Mimicking very efficient network for object detection. In CVPR.","DOI":"10.1109\/CVPR.2017.776"},{"key":"1453_CR143","doi-asserted-by":"crossref","unstructured":"Li, T., Li, J., Liu, Z., & Zhang, C. (2020d). Few sample knowledge distillation for efficient network compression. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01465"},{"key":"1453_CR144","doi-asserted-by":"crossref","unstructured":"Li, X., Wu, J., Fang, H., Liao, Y., Wang, F., & Qian, C. (2020e). Local correlation consistency for knowledge distillation. In ECCV.","DOI":"10.1007\/978-3-030-58610-2_2"},{"issue":"12","key":"1453_CR145","doi-asserted-by":"publisher","first-page":"2935","DOI":"10.1109\/TPAMI.2017.2773081","volume":"40","author":"Z Li","year":"2017","unstructured":"Li, Z., & Hoiem, D. (2017). Learning without forgetting. IEEE TPAMI, 40(12), 2935\u20132947.","journal-title":"IEEE TPAMI"},{"key":"1453_CR146","unstructured":"Lin, T., Kong, L., Stich, S. U., & Jaggi, M. (2020). Ensemble distillation for robust model fusion in federated learning. In NeurIPS."},{"key":"1453_CR147","unstructured":"Liu, I. J., Peng, J. & Schwing, A. G. (2019a). Knowledge flow: Improve upon your teachers. In ICLR."},{"key":"1453_CR148","doi-asserted-by":"crossref","unstructured":"Liu, J., Chen, Y. & Liu, K. (2019b). Exploiting the ground-truth: An adversarial imitation based knowledge distillation approach for event detection. In AAAI.","DOI":"10.1609\/aaai.v33i01.33016754"},{"key":"1453_CR149","doi-asserted-by":"crossref","unstructured":"Liu, J., Wen, D., Gao, H., Tao, W., Chen, T. W., Osa, K., et\u00a0al. (2019c). Knowledge representing: efficient, sparse representation of prior knowledge for knowledge distillation. In CVPRW.","DOI":"10.1109\/CVPRW.2019.00090"},{"key":"1453_CR150","doi-asserted-by":"crossref","unstructured":"Liu, P., King, I., Lyu, M. R., & Xu, J. (2019d). DDFlow: Learning optical flow with unlabeled data distillation. In AAAI.","DOI":"10.1609\/aaai.v33i01.33018770"},{"key":"1453_CR151","doi-asserted-by":"crossref","unstructured":"Liu, P., Liu, W., Ma, H., Mei, T. & Seok, M. (2020a). Ktan: knowledge transfer adversarial network. In IJCNN.","DOI":"10.1109\/IJCNN48605.2020.9207235"},{"key":"1453_CR152","doi-asserted-by":"crossref","unstructured":"Liu, Q., Xie, L., Wang, H., Yuille & A. L. (2019e). Semantic-aware knowledge preservation for zero-shot sketch-based image retrieval. In ICCV.","DOI":"10.1109\/ICCV.2019.00376"},{"key":"1453_CR153","unstructured":"Liu, R., Fusi, N. & Mackey, L. (2018). Model compression with generative adversarial networks. arXiv preprint arXiv:1812.02271."},{"key":"1453_CR154","doi-asserted-by":"crossref","unstructured":"Liu, W., Zhou, P., Zhao, Z., Wang, Z., Deng, H., & Ju, Q. (2020b). FastBERT: a self-distilling BERT with adaptive inference time. In ACL.","DOI":"10.18653\/v1\/2020.acl-main.537"},{"key":"1453_CR155","doi-asserted-by":"crossref","unstructured":"Liu, X., Wang, X. & Matwin, S. (2018b). Improving the interpretability of deep neural networks with knowledge distillation. In ICDMW.","DOI":"10.1109\/ICDMW.2018.00132"},{"key":"1453_CR156","unstructured":"Liu, X., He, P., Chen, W. & Gao, J. (2019f). Improving multi-task deep neural networks via knowledge distillation for natural language understanding. arXiv preprint arXiv:1904.09482."},{"key":"1453_CR157","doi-asserted-by":"crossref","unstructured":"Liu, Y., Cao, J., Li, B., Yuan, C., Hu, W., Li, Y. & Duan, Y. (2019g). Knowledge distillation via instance relationship graph. In CVPR.","DOI":"10.1109\/CVPR.2019.00726"},{"key":"1453_CR158","doi-asserted-by":"crossref","unstructured":"Liu, Y., Chen, K., Liu, C., Qin, Z., Luo, Z. & Wang, J. (2019h). Structured knowledge distillation for semantic segmentation. In CVPR.","DOI":"10.1109\/CVPR.2019.00271"},{"key":"1453_CR159","doi-asserted-by":"crossref","unstructured":"Liu, Y., Jia, X., Tan, M., Vemulapalli, R., Zhu, Y., Green, B., et\u00a0al. (2019i). Search to distill: Pearls are everywhere but not the eyes. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00756"},{"key":"1453_CR160","doi-asserted-by":"publisher","first-page":"106","DOI":"10.1016\/j.neucom.2020.07.048","volume":"415","author":"Y Liu","year":"2020","unstructured":"Liu, Y., Zhang, W., & Wang, J. (2020c). Adaptive multi-teacher multi-level knowledge distillation. Neurocomputing, 415, 106\u2013113.","journal-title":"Neurocomputing"},{"key":"1453_CR161","unstructured":"Lopes, R. G., Fenu, S. & Starner, T. (2017). Data-free knowledge distillation for deep neural networks. In NeurIPS."},{"key":"1453_CR162","unstructured":"Lopez-Paz, D., Bottou, L., Sch\u00f6lkopf, B. & Vapnik, V. (2016). Unifying distillation and privileged information. In ICLR."},{"key":"1453_CR163","doi-asserted-by":"crossref","unstructured":"Lu, L., Guo, M. & Renals, S. (2017). Knowledge distillation for small-footprint highway networks. In ICASSP.","DOI":"10.1109\/ICASSP.2017.7953072"},{"key":"1453_CR164","doi-asserted-by":"crossref","unstructured":"Luo, P., Zhu, Z., Liu, Z., Wang, X. & Tang, X. (2016). Face model compression by distilling knowledge from neurons. In AAAI.","DOI":"10.1609\/aaai.v30i1.10449"},{"key":"1453_CR165","doi-asserted-by":"crossref","unstructured":"Luo, S., Pan, W., Wang, X., Wang, D., Tang, H., & Song, M. (2020). Collaboration by competition: Self-coordinated knowledge amalgamation for multi-talent student learning. In ECCV.","DOI":"10.1007\/978-3-030-58539-6_38"},{"key":"1453_CR166","doi-asserted-by":"crossref","unstructured":"Luo, S., Wang, X., Fang, G., Hu, Y., Tao, D., & Song, M. (2019). Knowledge amalgamation from heterogeneous networks by common feature learning. In IJCAI.","DOI":"10.24963\/ijcai.2019\/428"},{"key":"1453_CR167","doi-asserted-by":"crossref","unstructured":"Luo, Z., Hsieh, J. T., Jiang, L., Carlos\u00a0Niebles, J. & Fei-Fei, L. (2018). Graph distillation for action detection with privileged modalities. In ECCV.","DOI":"10.1007\/978-3-030-01264-9_11"},{"key":"1453_CR168","unstructured":"Macko, V., Weill, C., Mazzawi, H. & Gonzalvo, J. (2019). Improving neural architecture search image classifiers via ensemble learning. In NeurIPS workshop."},{"key":"1453_CR169","unstructured":"Ma, J., & Mei, Q. (2019). Graph representation learning via multi-task knowledge distillation. arXiv preprint arXiv:1911.05700."},{"key":"1453_CR170","doi-asserted-by":"crossref","unstructured":"Ma, N., Zhang, X., Zheng, H. T., & Sun, J. (2018). Shufflenet v2: Practical guidelines for efficient CNN architecture design. In ECCV.","DOI":"10.1007\/978-3-030-01264-9_8"},{"key":"1453_CR171","doi-asserted-by":"crossref","unstructured":"Meng, Z., Li, J., Zhao, Y. & Gong, Y. (2019). Conditional teacher-student learning. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8683438"},{"key":"1453_CR172","unstructured":"Micaelli, P. & Storkey, A. J. (2019). Zero-shot knowledge transfer via adversarial belief matching. In NeurIPS."},{"key":"1453_CR173","unstructured":"Minami, S., Hirakawa, T., Yamashita, T. & Fujiyoshi, H. (2019). Knowledge transfer graph for deep collaborative learning. arXiv preprint arXiv:1909.04286."},{"key":"1453_CR174","doi-asserted-by":"crossref","unstructured":"Mirzadeh, S. I., Farajtabar, M., Li, A. & Ghasemzadeh, H. (2020). Improved knowledge distillation via teacher assistant. In AAAI.","DOI":"10.1609\/aaai.v34i04.5963"},{"key":"1453_CR175","unstructured":"Mishra, A. & Marr, D. (2018). Apprentice: Using knowledge distillation techniques to improve low-precision network accuracy. In ICLR."},{"key":"1453_CR176","unstructured":"Mobahi, H., Farajtabar, M., & Bartlett, P. L. (2020). Self-distillation amplifies regularization in hilbert space. In NeurIPS."},{"key":"1453_CR177","doi-asserted-by":"crossref","unstructured":"Mou, L., Jia, R., Xu, Y., Li, G., Zhang, L. & Jin, Z. (2016). Distilling word embeddings: An encoding approach. In CIKM.","DOI":"10.1145\/2983323.2983888"},{"key":"1453_CR178","doi-asserted-by":"crossref","unstructured":"Mukherjee, P., Das, A., Bhunia, A. K. & Roy, P. P. (2019). Cogni-net: Cognitive feature learning through deep visual perception. In ICIP.","DOI":"10.1109\/ICIP.2019.8803717"},{"key":"1453_CR179","doi-asserted-by":"crossref","unstructured":"Mullapudi, R. T., Chen, S., Zhang, K., Ramanan, D. & Fatahalian, K. (2019). Online model distillation for efficient video inference. In ICCV.","DOI":"10.1109\/ICCV.2019.00367"},{"key":"1453_CR180","unstructured":"Muller, R., Kornblith, S. & Hinton, G. E. (2019). When does label smoothing help? In NeurIPS."},{"key":"1453_CR181","unstructured":"Mun, J., Lee, K., Shin, J. & Han, B. (2018). Learning to specialize with knowledge distillation for visual question answering. In NeurIPS."},{"key":"1453_CR182","unstructured":"Munjal, B., Galasso, F. & Amin, S. (2019). Knowledge distillation for end-to-end person search. In BMVC."},{"key":"1453_CR183","doi-asserted-by":"crossref","unstructured":"Nakashole, N. & Flauger, R. (2017). Knowledge distillation for bilingual dictionary induction. In EMNLP.","DOI":"10.18653\/v1\/D17-1264"},{"key":"1453_CR184","doi-asserted-by":"crossref","unstructured":"Nayak, G. K., Mopuri, K. R., & Chakraborty, A. (2021). Effectiveness of arbitrary transfer sets for data-free knowledge distillation. In WACV.","DOI":"10.1109\/WACV48630.2021.00147"},{"key":"1453_CR185","unstructured":"Nayak, G. K., Mopuri, K. R., Shaj, V., Babu, R. V. & Chakraborty, A. (2019). Zero-shot knowledge distillation in deep networks. In ICML."},{"key":"1453_CR186","doi-asserted-by":"crossref","unstructured":"Ng, R. W., Liu, X. & Swietojanski, P. (2018). Teacher-student training for text-independent speaker recognition. In SLTW.","DOI":"10.1109\/SLT.2018.8639564"},{"key":"1453_CR187","doi-asserted-by":"crossref","unstructured":"Nie, X., Li, Y., Luo, L., Zhang, N. & Feng, J. (2019). Dynamic kernel distillation for efficient pose estimation in videos. In ICCV.","DOI":"10.1109\/ICCV.2019.00704"},{"key":"1453_CR188","doi-asserted-by":"crossref","unstructured":"Noroozi, M., Vinjimoor, A., Favaro, P. & Pirsiavash, H. (2018). Boosting self-supervised learning via knowledge transfer. In CVPR.","DOI":"10.1109\/CVPR.2018.00975"},{"key":"1453_CR189","unstructured":"Nowak, T. S. & Corso, J. J. (2018). Deep net triage: Analyzing the importance of network layers via structural compression. arXiv preprint arXiv:1801.04651."},{"key":"1453_CR190","unstructured":"Oord, A., Li, Y., Babuschkin, I., Simonyan, K., Vinyals, O., Kavukcuoglu, K., et\u00a0al. (2018). Parallel wavenet: Fast high-fidelity speech synthesis. In ICML."},{"key":"1453_CR191","doi-asserted-by":"crossref","unstructured":"Pan, B., Cai, H., Huang, D. A., Lee, K. H., Gaidon, A., Adeli, E., & Niebles, J. C. (2020). Spatio-temporal graph for video captioning with knowledge distillation. In CVPR","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"1453_CR192","doi-asserted-by":"publisher","first-page":"137","DOI":"10.1016\/j.neucom.2018.12.025","volume":"332","author":"Y Pan","year":"2019","unstructured":"Pan, Y., He, F., & Yu, H. (2019). A novel enhanced collaborative autoencoder with knowledge distillation for top-n recommender systems. Neurocomputing, 332, 137\u2013148.","journal-title":"Neurocomputing"},{"key":"1453_CR193","unstructured":"Papernot, N., Abadi, M., Erlingsson, U., Goodfellow, I. & Talwar, K. (2017). Semi-supervised knowledge transfer for deep learning from private training data. In ICLR."},{"key":"1453_CR194","doi-asserted-by":"crossref","unstructured":"Papernot, N., McDaniel, P., Wu, X., Jha, S. & Swami, A. (2016). Distillation as a defense to adversarial perturbations against deep neural networks. In IEEE SP.","DOI":"10.1109\/SP.2016.41"},{"key":"1453_CR195","unstructured":"Park, S. & Kwak, N. (2020). Feature-level ensemble knowledge distillation for aggregating knowledge from multiple networks. In ECAI."},{"key":"1453_CR196","doi-asserted-by":"crossref","unstructured":"Park, W., Kim, D., Lu, Y. & Cho, M. (2019). Relational knowledge distillation. In CVPR.","DOI":"10.1109\/CVPR.2019.00409"},{"key":"1453_CR197","doi-asserted-by":"crossref","unstructured":"Passban, P., Wu, Y., Rezagholizadeh, M., & Liu, Q. (2021). ALP-KD: Attention-based layer projection for knowledge distillation. In AAAI.","DOI":"10.1609\/aaai.v35i15.17610"},{"key":"1453_CR198","doi-asserted-by":"crossref","unstructured":"Passalis, N. & Tefas, A. (2018). Learning deep representations with probabilistic knowledge transfer. In ECCV.","DOI":"10.1007\/978-3-030-01252-6_17"},{"key":"1453_CR199","doi-asserted-by":"publisher","unstructured":"Passalis, N., Tzelepi, M., & Tefas, A. (2020a). Probabilistic knowledge transfer for lightweight deep representation learning. TNNLS. https:\/\/doi.org\/10.1109\/TNNLS.2020.2995884.","DOI":"10.1109\/TNNLS.2020.2995884"},{"key":"1453_CR200","doi-asserted-by":"crossref","unstructured":"Passalis, N., Tzelepi, M., & Tefas, A. (2020b). Heterogeneous knowledge distillation using information flow modeling. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00241"},{"key":"1453_CR201","doi-asserted-by":"crossref","unstructured":"Peng, B., Jin, X., Liu, J., Li, D., Wu, Y., Liu, Y., et\u00a0al. (2019a). Correlation congruence for knowledge distillation. In ICCV.","DOI":"10.1109\/ICCV.2019.00511"},{"key":"1453_CR202","unstructured":"Peng, H., Du, H., Yu, H., Li, Q., Liao, J., & Fu, J. (2020). Cream of the crop: Distilling prioritized paths for one-shot neural architecture search. In NeurIPS."},{"key":"1453_CR203","doi-asserted-by":"crossref","unstructured":"Peng, Z., Li, Z., Zhang, J., Li, Y., Qi, G. J. & Tang, J. (2019b). Few-shot image recognition with knowledge transfer. In ICCV.","DOI":"10.1109\/ICCV.2019.00053"},{"key":"1453_CR204","doi-asserted-by":"crossref","unstructured":"Perez, A., Sanguineti, V., Morerio, P. & Murino, V. (2020). Audio-visual model distillation using acoustic images. In WACV.","DOI":"10.1109\/WACV45572.2020.9093307"},{"key":"1453_CR205","unstructured":"Phuong, M., & Lampert, C. H. (2019a). Towards understanding knowledge distillation. In ICML."},{"key":"1453_CR206","doi-asserted-by":"crossref","unstructured":"Phuong, M., & Lampert, C. H. (2019b). Distillation-based training for multi-exit architectures. In ICCV.","DOI":"10.1109\/ICCV.2019.00144"},{"key":"1453_CR207","doi-asserted-by":"crossref","unstructured":"Pilzer, A., Lathuiliere, S., Sebe, N. & Ricci, E. (2019). Refine and distill: Exploiting cycle-inconsistency and knowledge distillation for unsupervised monocular depth estimation. In CVPR.","DOI":"10.1109\/CVPR.2019.01000"},{"key":"1453_CR208","unstructured":"Polino, A., Pascanu, R. & Alistarh, D. (2018). Model compression via distillation and quantization. In ICLR."},{"issue":"1","key":"1453_CR209","doi-asserted-by":"publisher","first-page":"10","DOI":"10.1186\/s13636-016-0088-7","volume":"2016","author":"R Price","year":"2016","unstructured":"Price, R., Iso, K., & Shinoda, K. (2016). Wise teachers train better DNN acoustic models. EURASIP Journal on Audio, Speech, and Music Processing, 2016(1), 10.","journal-title":"EURASIP Journal on Audio, Speech, and Music Processing"},{"key":"1453_CR210","doi-asserted-by":"crossref","unstructured":"Radosavovic, I., Dollar, P., Girshick, R., Gkioxari, G., & He, K. (2018). Data distillation: Towards omni-supervised learning. In CVPR.","DOI":"10.1109\/CVPR.2018.00433"},{"key":"1453_CR211","doi-asserted-by":"crossref","unstructured":"Radosavovic, I., Kosaraju, R. P., Girshick, R., He, K., & Dollar P. (2020). Designing network design spaces. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01044"},{"key":"1453_CR212","doi-asserted-by":"crossref","unstructured":"Roheda, S., Riggan, B. S., Krim, H. & Dai, L. (2018). Cross-modality distillation: A case for conditional generative adversarial networks. In ICASSP.","DOI":"10.1109\/ICASSP.2018.8462082"},{"key":"1453_CR213","unstructured":"Romero, A., Ballas, N., Kahou, S. E., Chassang, A., Gatta, C., & Bengio, Y. (2015). Fitnets: Hints for thin deep nets. In ICLR."},{"key":"1453_CR214","doi-asserted-by":"crossref","unstructured":"Ross, A. S. & Doshi-Velez, F. (2018). Improving the adversarial robustness and interpretability of deep neural networks by regularizing their input gradients. In AAAI.","DOI":"10.1609\/aaai.v32i1.11504"},{"key":"1453_CR215","unstructured":"Ruder, S., Ghaffari, P. & Breslin, J. G. (2017). Knowledge adaptation: Teaching to adapt. arXiv preprint arXiv:1702.02052."},{"key":"1453_CR216","doi-asserted-by":"crossref","unstructured":"Sandler, M., Howard, A., Zhu, M., Zhmoginov, A., & Chen, L. C. (2018). Mobilenetv2: Inverted residuals and linear bottlenecks. In CVPR.","DOI":"10.1109\/CVPR.2018.00474"},{"key":"1453_CR217","unstructured":"Sanh, V., Debut, L., Chaumond, J. & Wolf, T. (2019). Distilbert, a distilled version of bert: smaller, faster, cheaper and lighter. arXiv preprint arXiv:1910.01108."},{"key":"1453_CR218","doi-asserted-by":"crossref","unstructured":"Saputra, M. R. U., de\u00a0Gusmao, P. P., Almalioglu, Y., Markham, A. & Trigoni, N. (2019). Distilling knowledge from a deep pose regressor network. In ICCV.","DOI":"10.1109\/ICCV.2019.00035"},{"key":"1453_CR219","unstructured":"Sau, B. B. & Balasubramanian, V. N. (2016). Deep model compression: Distilling knowledge from noisy teachers. arXiv preprint arXiv:1610.09650."},{"key":"1453_CR220","unstructured":"Seo, H., Park, J., Oh, S., Bennis, M., & Kim, S. L. (2020). Federated Knowledge Distillation. arXiv preprint arXiv:2011.02367."},{"key":"1453_CR221","unstructured":"Shakeri, S., Sethy, A. & Cheng, C. (2019). Knowledge distillation in document retrieval. arXiv preprint arXiv:1911.11065."},{"key":"1453_CR222","doi-asserted-by":"crossref","unstructured":"Shen, C., Wang, X., Song, J., Sun, L., & Song, M. (2019a). Amalgamating knowledge towards comprehensive classification. In AAAI.","DOI":"10.1609\/aaai.v33i01.33013068"},{"key":"1453_CR223","doi-asserted-by":"crossref","unstructured":"Shen, C., Wang, X., Yin, Y., Song, J., Luo, S., & Song, M. (2021). Progressive network grafting for few-shot knowledge distillation. In AAAI.","DOI":"10.1609\/aaai.v35i3.16356"},{"key":"1453_CR224","doi-asserted-by":"crossref","unstructured":"Shen, C., Xue, M., Wang, X., Song, J., Sun, L., & Song, M. (2019b). Customizing student networks from heterogeneous teachers via adaptive knowledge amalgamation. In ICCV.","DOI":"10.1109\/ICCV.2019.00360"},{"key":"1453_CR225","unstructured":"Shen, J., Vesdapunt, N., Boddeti, V. N. & Kitani, K. M. (2016). In teacher we trust: Learning compressed models for pedestrian detection. arXiv preprint arXiv:1612.00478."},{"key":"1453_CR226","doi-asserted-by":"crossref","unstructured":"Shen, P., Lu, X., Li, S. & Kawai, H. (2018). Feature representation of short utterances based on knowledge distillation for spoken language identification. In Interspeech.","DOI":"10.21437\/Interspeech.2018-1519"},{"key":"1453_CR227","doi-asserted-by":"publisher","first-page":"2674","DOI":"10.1109\/TASLP.2020.3023627","volume":"28","author":"P Shen","year":"2020","unstructured":"Shen, P., Lu, X., Li, S., & Kawai, H. (2020). Knowledge distillation-based representation learning for short-utterance spoken language identification. IEEE\/ACM Transactions on Audio Speech and Language, 28, 2674\u20132683.","journal-title":"IEEE\/ACM Transactions on Audio Speech and Language"},{"key":"1453_CR228","doi-asserted-by":"crossref","unstructured":"Shen, P., Lu, X., Li, S. & Kawai, H. (2019c). Interactive learning of teacher-student model for short utterance spoken language identification. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8683371"},{"key":"1453_CR229","doi-asserted-by":"crossref","unstructured":"Shen, Z., He, Z. & Xue, X. (2019d). Meal: Multi-model ensemble via adversarial learning. In AAAI.","DOI":"10.1609\/aaai.v33i01.33014886"},{"key":"1453_CR230","doi-asserted-by":"crossref","unstructured":"Shi, B., Sun, M., Kao, C. C., Rozgic, V., Matsoukas, S. & Wang, C. (2019a). Compression of acoustic event detection models with quantized distillation. In Interspeech.","DOI":"10.21437\/Interspeech.2019-1747"},{"key":"1453_CR231","doi-asserted-by":"crossref","unstructured":"Shi, B., Sun, M., Kao, CC., Rozgic, V., Matsoukas, S. & Wang, C. (2019b). Semi-supervised acoustic event detection based on tri-training. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8683710"},{"key":"1453_CR232","doi-asserted-by":"crossref","unstructured":"Shi, Y., Hwang, M. Y., Lei, X., & Sheng, H. (2019c). Knowledge distillation for recurrent neural network language modeling with trust regularization. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8683533"},{"key":"1453_CR233","unstructured":"Shin, S., Boo, Y. & Sung, W. (2019). Empirical analysis of knowledge distillation technique for optimization of quantized deep neural networks. arXiv preprint arXiv:1909.01688."},{"key":"1453_CR234","doi-asserted-by":"crossref","unstructured":"Shmelkov, K., Schmid, C., & Alahari, K. (2017). Incremental learning of object detectors without catastrophic forgetting. In ICCV.","DOI":"10.1109\/ICCV.2017.368"},{"key":"1453_CR235","unstructured":"Shu, C., Li, P., Xie, Y., Qu, Y., Dai, L., & Ma, L.(2019). Knowledge squeezed adversarial network compression. arXiv preprint arXiv:1904.05100."},{"key":"1453_CR236","doi-asserted-by":"crossref","unstructured":"Siam, M., Jiang, C., Lu, S., Petrich, L., Gamal, M., Elhoseiny, M., et\u00a0al. (2019). Video object segmentation using teacher-student adaptation in a human robot interaction (HRI) setting. In ICRA.","DOI":"10.1109\/ICRA.2019.8794254"},{"key":"1453_CR237","unstructured":"Sindhwani, V., Sainath, T. & Kumar, S. (2015). Structured transforms for small-footprint deep learning. In NeurIPS."},{"key":"1453_CR238","unstructured":"Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., Van Den Driessche, G., & Dieleman, S. (2016). Mastering the game of Go with deep neural networks and tree search. Nature, 529(7587), 484\u2013489."},{"key":"1453_CR239","doi-asserted-by":"crossref","unstructured":"Song, X., Feng, F., Han, X., Yang, X., Liu, W. & Nie, L. (2018). Neural compatibility modeling with attentive knowledge distillation. In SIGIR.","DOI":"10.1145\/3209978.3209996"},{"key":"1453_CR240","unstructured":"Srinivas, S. & Fleuret, F. (2018). Knowledge transfer with jacobian matching. In ICML."},{"key":"1453_CR241","doi-asserted-by":"crossref","unstructured":"Su, J. C. & Maji, S. (2017). Adapting models to signal degradation using distillation. In BMVC.","DOI":"10.5244\/C.31.21"},{"key":"1453_CR242","unstructured":"Sun, L., Gou, J., Yu, B., Du, L., & Tao, D. (2021) Collaborative teacher\u2013student learning via multiple knowledge transfer. arXiv\u00a0preprint arXiv:2101.08471."},{"key":"1453_CR243","doi-asserted-by":"crossref","unstructured":"Sun, S., Cheng, Y., Gan, Z. & Liu, J. (2019). Patient knowledge distillation for bert model compression. In NEMNLP-IJCNLP.","DOI":"10.18653\/v1\/D19-1441"},{"key":"1453_CR244","unstructured":"Sun, P., Feng, W., Han, R., Yan, S., & Wen, Y. (2019). Optimizing network performance for distributed dnn training on gpu clusters: Imagenet\/alexnet training in 1.5 minutes. arXiv preprint arXiv:1902.06855."},{"key":"1453_CR245","doi-asserted-by":"crossref","unstructured":"Takashima, R., Li, S. & Kawai, H. (2018). An investigation of a knowledge distillation method for CTC acoustic models. In ICASSP.","DOI":"10.1109\/ICASSP.2018.8461995"},{"key":"1453_CR246","first-page":"1275","volume":"30","author":"H Tan","year":"2021","unstructured":"Tan, H., Liu, X., Liu, M., Yin, B., & Li, X. (2021). KT-GAN: Knowledge-transfer generative adversarial network for text-to-image synthesis. IEEE TIP, 30, 1275\u20131290.","journal-title":"IEEE TIP"},{"key":"1453_CR247","doi-asserted-by":"crossref","unstructured":"Tan, M., Chen, B., Pang, R., Vasudevan, V., Sandler, M., Howard, A., & Le, Q. V. (2019). Mnasnet: Platform-aware neural architecture search for mobile. In CVPR.","DOI":"10.1109\/CVPR.2019.00293"},{"key":"1453_CR248","unstructured":"Tan, M., & Le, Q. (2019). EfficientNet: Rethinking model scaling for convolutional neural networks. In ICML."},{"key":"1453_CR249","unstructured":"Tan, X., Ren, Y., He, D., Qin, T., Zhao, Z. & Liu, T. Y. (2019). Multilingual neural machine translation with knowledge distillation. In ICLR."},{"key":"1453_CR250","unstructured":"Tang, J., Shivanna, R., Zhao, Z., Lin, D., Singh, A., Chi, E. H., & Jain, S. (2020). Understanding and improving knowledge distillation. arXiv preprint arXiv:2002.03532."},{"key":"1453_CR251","doi-asserted-by":"crossref","unstructured":"Tang, J., & Wang, K. (2018). Ranking distillation: Learning compact ranking models with high performance for recommender system. In SIGKDD.","DOI":"10.1145\/3219819.3220021"},{"key":"1453_CR252","unstructured":"Tang, R., Lu, Y., Liu, L., Mou, L., Vechtomova, O. & Lin, J. (2019). Distilling task-specific knowledge from bert into simple neural networks. arXiv preprint arXiv:1903.12136."},{"key":"1453_CR253","unstructured":"Tarvainen, A., & Valpola, H. (2017). Mean teachers are better role models: Weight-averaged consistency targets improve semi-supervised deep learning results. In NeurIPS."},{"key":"1453_CR254","doi-asserted-by":"crossref","unstructured":"Thoker, F. M. & Gall, J. (2019). Cross-modal knowledge distillation for action recognition. In ICIP.","DOI":"10.1109\/ICIP.2019.8802909"},{"key":"1453_CR255","unstructured":"Tian, Y., Krishnan, D. & Isola, P. (2020). Contrastive representation distillation. In ICLR."},{"key":"1453_CR256","unstructured":"Tu, Z., He, F., & Tao, D. (2020). Understanding generalization in recurrent neural networks. In International conference on learning representations. ICLR."},{"key":"1453_CR257","doi-asserted-by":"crossref","unstructured":"Tung, F., & Mori, G. (2019). Similarity-preserving knowledge distillation. In ICCV.","DOI":"10.1109\/ICCV.2019.00145"},{"key":"1453_CR258","unstructured":"Turc, I., Chang, M. W., Lee, K. & Toutanova, K. (2019). Well-read students learn better: The impact of student initialization on knowledge distillation. arXiv preprint arXiv:1908.08962."},{"key":"1453_CR259","unstructured":"Urban, G., Geras, K. J., Kahou, S. E., Aslan, O., Wang, S., Caruana, R., (2017). Do deep convolutional nets really need to be deep and convolutional? In ICLR."},{"issue":"1","key":"1453_CR260","first-page":"2023","volume":"16","author":"V Vapnik","year":"2015","unstructured":"Vapnik, V., & Izmailov, R. (2015). Learning using privileged information: Similarity control and knowledge transfer. Journal of Machine Learning Research, 16(1), 2023\u20132049.","journal-title":"Journal of Machine Learning Research"},{"key":"1453_CR261","doi-asserted-by":"crossref","unstructured":"Vongkulbhisal, J., Vinayavekhin, P. & Visentini-Scarzanella, M. (2019). Unifying heterogeneous classifiers with distillation. In CVPR.","DOI":"10.1109\/CVPR.2019.00329"},{"key":"1453_CR262","doi-asserted-by":"crossref","unstructured":"Walawalkar, D., Shen, Z., & Savvides, M. (2020). Online ensemble model compression using knowledge distillation. In ECCV.","DOI":"10.1007\/978-3-030-58529-7_2"},{"key":"1453_CR263","unstructured":"Wang, C., Lan, X. & Zhang, Y. (2017). Model distillation with knowledge transfer from face classification to alignment and verification. arXiv preprint arXiv:1709.02929."},{"key":"1453_CR264","unstructured":"Wang, L., & Yoon, K. J. (2020). Knowledge distillation and student-teacher learning for visual intelligence: A review and new outlooks. arXiv preprint arXiv:2004.05937."},{"key":"1453_CR265","doi-asserted-by":"crossref","unstructured":"Wang, H., Zhao, H., Li, X. & Tan, X. (2018a). Progressive blockwise knowledge distillation for neural network acceleration. In IJCAI.","DOI":"10.24963\/ijcai.2018\/384"},{"key":"1453_CR266","doi-asserted-by":"crossref","unstructured":"Wang, J., Bao, W., Sun, L., Zhu, X., Cao, B., & Philip, S. Y. (2019a). Private model compression via knowledge distillation. In AAAI.","DOI":"10.1609\/aaai.v33i01.33011190"},{"issue":"6","key":"1453_CR267","first-page":"2168","volume":"25","author":"J Wang","year":"2019","unstructured":"Wang, J., Gou, L., Zhang, W., Yang, H., & Shen, H. W. (2019b). Deepvid: Deep visual interpretation and diagnosis for image classifiers via knowledge distillation. TVCG, 25(6), 2168\u20132180.","journal-title":"TVCG"},{"key":"1453_CR268","doi-asserted-by":"crossref","unstructured":"Wang, M., Liu, R., Abe, N., Uchida, H., Matsunami, T., & Yamada, S. (2018b). Discover the effective strategy for face recognition model compression by improved knowledge distillation. In ICIP.","DOI":"10.1109\/ICIP.2018.8451808"},{"key":"1453_CR269","doi-asserted-by":"crossref","unstructured":"Wang, M., Liu, R., Hajime, N., Narishige, A., Uchida, H. & Matsunami, T.(2019c). Improved knowledge distillation for training fast low resolution face recognition model. In ICCVW.","DOI":"10.1109\/ICCVW.2019.00324"},{"key":"1453_CR270","doi-asserted-by":"crossref","unstructured":"Wang, T., Yuan, L., Zhang, X. & Feng, J. (2019d). Distilling object detectors with fine-grained feature imitation. In CVPR.","DOI":"10.1109\/CVPR.2019.00507"},{"key":"1453_CR271","unstructured":"Wang, T., Zhu, J. Y., Torralba, A., & Efros, A. A. (2018c). Dataset distillation. arXiv preprint arXiv:1811.10959."},{"key":"1453_CR272","doi-asserted-by":"crossref","unstructured":"Wang, W., Wei, F., Dong, L., Bao, H., Yang, N., & Zhou, M. (2020a). Minilm: Deep self-attention distillation for task-agnostic compression of pre-trained transformers. In NeurIPS.","DOI":"10.18653\/v1\/2021.findings-acl.188"},{"key":"1453_CR273","doi-asserted-by":"crossref","unstructured":"Wang, W., Zhang, J., Zhang, H., Hwang, M. Y., Zong, C. & Li, Z. (2018d). A teacher-student framework for maintainable dialog manager. In EMNLP.","DOI":"10.18653\/v1\/D18-1415"},{"key":"1453_CR274","doi-asserted-by":"crossref","unstructured":"Wang, X., Fu, T., Liao, S., Wang, S., Lei, Z., & Mei, T. (2020b). Exclusivity-consistency regularized knowledge distillation for face recognition. In ECCV.","DOI":"10.1007\/978-3-030-58586-0_20"},{"key":"1453_CR275","doi-asserted-by":"crossref","unstructured":"Wang, X., Hu, J. F., Lai, J. H., Zhang, J. & Zheng, W. S. (2019e). Progressive teacher-student learning for early action prediction. In CVPR.","DOI":"10.1109\/CVPR.2019.00367"},{"key":"1453_CR276","unstructured":"Wang, X., Zhang, R., Sun, Y. & Qi, J. (2018e) Kdgan: Knowledge distillation with generative adversarial networks. In NeurIPS."},{"issue":"10","key":"1453_CR277","doi-asserted-by":"publisher","first-page":"2495","DOI":"10.1109\/TPAMI.2018.2857824","volume":"41","author":"Y Wang","year":"2019","unstructured":"Wang, Y., Xu, C., Xu, C., & Tao, D. (2019f). Packing convolutional neural networks in the frequency domain. IEEE TPAMI, 41(10), 2495\u20132510.","journal-title":"IEEE TPAMI"},{"key":"1453_CR278","doi-asserted-by":"crossref","unstructured":"Wang, Y., Xu, C., Xu, C. & Tao, D. (2018f). Adversarial learning of portable student networks. In AAAI.","DOI":"10.1609\/aaai.v32i1.11667"},{"key":"1453_CR279","doi-asserted-by":"publisher","first-page":"107722","DOI":"10.1016\/j.patcog.2020.107722","volume":"111","author":"ZR Wang","year":"2021","unstructured":"Wang, Z. R., & Du, J. (2021). Joint architecture and knowledge distillation in CNN for Chinese text recognition. Pattern Recognition, 111, 107722.","journal-title":"Pattern Recognition"},{"key":"1453_CR280","doi-asserted-by":"crossref","unstructured":"Watanabe, S., Hori, T., Le\u00a0Roux, J. & Hershey, J. R. (2017). Student-teacher network learning with enhanced features. In ICASSP.","DOI":"10.1109\/ICASSP.2017.7953163"},{"key":"1453_CR281","doi-asserted-by":"crossref","unstructured":"Wei, H. R., Huang, S., Wang, R., Dai, X. & Chen, J. (2019). Online distilling from checkpoints for neural machine translation. In NAACL-HLT.","DOI":"10.18653\/v1\/N19-1192"},{"key":"1453_CR282","doi-asserted-by":"crossref","unstructured":"Wei, Y., Pan, X., Qin, H., Ouyang, W. & Yan, J. (2018). Quantization mimic: Towards very tiny CNN for object detection. In ECCV.","DOI":"10.1007\/978-3-030-01237-3_17"},{"key":"1453_CR283","doi-asserted-by":"crossref","unstructured":"Wong, J. H. & Gales, M. (2016). Sequence student-teacher training of deep neural networks. In Interspeech.","DOI":"10.21437\/Interspeech.2016-911"},{"key":"1453_CR284","doi-asserted-by":"crossref","unstructured":"Wu, B., Dai, X., Zhang, P., Wang, Y., Sun, F., Wu, Y., et al. (2019). Fbnet: Hardware-aware efficient convnet design via differentiable neural architecture search. In CVPR.","DOI":"10.1109\/CVPR.2019.01099"},{"key":"1453_CR285","doi-asserted-by":"crossref","unstructured":"Wu, A., Zheng, W. S., Guo, X. & Lai, J. H. (2019a). Distilled person re-identification: Towards a more scalable system. In CVPR.","DOI":"10.1109\/CVPR.2019.00128"},{"key":"1453_CR286","doi-asserted-by":"crossref","unstructured":"Wu, G., & Gong, S. (2021). Peer collaborative learning for online knowledge distillation. In AAAI.","DOI":"10.1609\/aaai.v35i12.17234"},{"key":"1453_CR287","doi-asserted-by":"crossref","unstructured":"Wu, J., Leng, C., Wang, Y., Hu, Q. & Cheng, J. (2016). Quantized convolutional neural networks for mobile devices. In CVPR.","DOI":"10.1109\/CVPR.2016.521"},{"key":"1453_CR288","doi-asserted-by":"crossref","unstructured":"Wu, M. C., Chiu, C. T. & Wu, K. H. (2019b). Multi-teacher knowledge distillation for compressed video action recognition on deep neural networks. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8682450"},{"key":"1453_CR289","doi-asserted-by":"crossref","unstructured":"Wu, X., He, R., Hu, Y., & Sun, Z. (2020). Learning an evolutionary embedding via massive knowledge distillation. International Journal of Computer Vision, 1\u201318.","DOI":"10.1007\/s11263-019-01286-x"},{"issue":"11","key":"1453_CR290","first-page":"2063","volume":"31","author":"S Xia","year":"2018","unstructured":"Xia, S., Wang, G., Chen, Z., & Duan, Y. (2018). Complete random forest based class noise filtering learning for improving the generalizability of classifiers. IEEE TKDE, 31(11), 2063\u20132078.","journal-title":"IEEE TKDE"},{"key":"1453_CR291","unstructured":"Xie, J., Lin, S., Zhang, Y. & Luo, L. (2019). Training convolutional neural networks with cheap convolutions and online distillation. arXiv preprint arXiv:1909.13063."},{"key":"1453_CR292","doi-asserted-by":"crossref","unstructured":"Xie, Q., Hovy, E., Luong, M. T., & Le, Q. V. (2020). Self-training with Noisy Student improves ImageNet classification. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01070"},{"key":"1453_CR293","doi-asserted-by":"crossref","unstructured":"Xu, G., Liu, Z., Li, X., & Loy, C. C. (2020a). Knowledge distillation meets self-supervision. In ECCV.","DOI":"10.1007\/978-3-030-58545-7_34"},{"key":"1453_CR294","doi-asserted-by":"crossref","unstructured":"Xu, K., Rui, L., Li, Y., & Gu, L. (2020b). Feature normalized knowledge distillation for image classification. In ECCV.","DOI":"10.1007\/978-3-030-58595-2_40"},{"key":"1453_CR295","unstructured":"Xu, Z., Wu, K., Che, Z., Tang, J., & Ye, J. (2020c). Knowledge transfer in multi-task deep reinforcement learning for continuous control. In NeurIPS."},{"key":"1453_CR296","unstructured":"Xu, Z., Hsu, Y. C. & Huang, J. (2018a). Training shallow and thin networks for acceleration via knowledge distillation with conditional adversarial networks. In ICLR workshop."},{"key":"1453_CR297","unstructured":"Xu, Z., Hsu, Y. C. & Huang, J. (2018b). Training student networks for acceleration with conditional adversarial networks. In BMVC."},{"key":"1453_CR298","doi-asserted-by":"crossref","unstructured":"Xu, T. B., & Liu, C. L. (2019). Data-distortion guided self-distillation for deep neural networks. In AAAI.","DOI":"10.1609\/aaai.v33i01.33015565"},{"key":"1453_CR299","doi-asserted-by":"crossref","unstructured":"Yan, M., Zhao, M., Xu, Z., Zhang, Q., Wang, G. & Su, Z. (2019). Vargfacenet: An efficient variable group convolutional neural network for lightweight face recognition. In ICCVW.","DOI":"10.1109\/ICCVW.2019.00323"},{"key":"1453_CR300","doi-asserted-by":"crossref","unstructured":"Yang, C., Xie, L., Qiao, S. & Yuille, A. (2019a). Knowledge distillation in generations: More tolerant teachers educate better students. In AAAI.","DOI":"10.1609\/aaai.v33i01.33015628"},{"key":"1453_CR301","doi-asserted-by":"crossref","unstructured":"Yang, C., Xie, L., Su, C. & Yuille, A. L. (2019b). Snapshot distillation: Teacher-student optimization in one generation. In CVPR.","DOI":"10.1109\/CVPR.2019.00297"},{"key":"1453_CR302","unstructured":"Yang, J., Martinez, B., Bulat, A., & Tzimiropoulos, G. (2020a). Knowledge distillation via adaptive instance normalization. In ECCV."},{"key":"1453_CR303","doi-asserted-by":"crossref","unstructured":"Yang, Y., Qiu, J., Song, M., Tao, D. & Wang, X. (2020b). Distilling knowledge from graph convolutional networks. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00710"},{"key":"1453_CR304","doi-asserted-by":"crossref","unstructured":"Yang, Z., Shou, L., Gong, M., Lin, W. & Jiang, D. (2020c). Model compression with two-stage multi-teacher knowledge distillation for web question answering system. In WSDM.","DOI":"10.1145\/3336191.3371792"},{"key":"1453_CR305","doi-asserted-by":"crossref","unstructured":"Yao, A., & Sun, D. (2020). Knowledge transfer via dense cross-layer mutual-distillation. In ECCV.","DOI":"10.1007\/978-3-030-58555-6_18"},{"key":"1453_CR306","doi-asserted-by":"crossref","unstructured":"Yao, H., Zhang, C., Wei, Y., Jiang, M., Wang, S., Huang, J., Chawla, N. V., & Li, Z. (2020). Graph few-shot learning via knowledge transfer. In AAAI.","DOI":"10.1609\/aaai.v34i04.6142"},{"key":"1453_CR307","doi-asserted-by":"crossref","unstructured":"Ye, J., Ji, Y., Wang, X., Gao, X., & Song, M. (2020). Data-free knowledge amalgamation via group-stack dual-GAN. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01253"},{"key":"1453_CR308","doi-asserted-by":"crossref","unstructured":"Ye, J., Ji, Y., Wang, X., Ou, K., Tao, D. & Song, M. (2019). Student becoming the master: Knowledge amalgamation for joint scene parsing, depth estimation, and more. In CVPR.","DOI":"10.1109\/CVPR.2019.00294"},{"key":"1453_CR309","doi-asserted-by":"crossref","unstructured":"Yim, J., Joo, D., Bae, J. & Kim, J. (2017). A gift from knowledge distillation: Fast optimization, network minimization and transfer learning. In CVPR.","DOI":"10.1109\/CVPR.2017.754"},{"key":"1453_CR310","doi-asserted-by":"crossref","unstructured":"Yin, H., Molchanov, P., Alvarez, J. M., Li, Z., Mallya, A., Hoiem, D., Jha, Niraj K., & Kautz, J. (2020). Dreaming to distill: Data-free knowledge transfer via DeepInversion. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00874"},{"key":"1453_CR311","unstructured":"Yoo, J., Cho, M., Kim, T., & Kang, U. (2019). Knowledge extraction with no observable data. In NeurIPS."},{"key":"1453_CR312","doi-asserted-by":"crossref","unstructured":"You, S., Xu, C., Xu, C., & Tao, D. (2017). Learning from multiple teacher networks. In SIGKDD.","DOI":"10.1145\/3097983.3098135"},{"key":"1453_CR313","doi-asserted-by":"crossref","unstructured":"You, S., Xu, C., Xu, C. & Tao, D. (2018). Learning with single-teacher multi-student. In AAAI.","DOI":"10.1609\/aaai.v32i1.11636"},{"key":"1453_CR314","unstructured":"You, Y., Li, J., Reddi, S., Hseu, J., Kumar, S., Bhojanapalli, S., et al. (2019). Large batch optimization for deep learning: Training bert in 76 minutes. In ICLR."},{"key":"1453_CR315","doi-asserted-by":"crossref","unstructured":"Yu, L., Yazici, V. O., Liu, X., Weijer, J., Cheng, Y. & Ramisa, A. (2019). Learning metrics from teachers: Compact networks for image embedding. In CVPR.","DOI":"10.1109\/CVPR.2019.00302"},{"key":"1453_CR316","doi-asserted-by":"crossref","unstructured":"Yu, X., Liu, T., Wang, X., & Tao, D. (2017). On compressing deep models by low rank and sparse decomposition. In CVPR.","DOI":"10.1109\/CVPR.2017.15"},{"key":"1453_CR317","doi-asserted-by":"crossref","unstructured":"Yuan, F., Shou, L., Pei, J., Lin, W., Gong, M., Fu, Y., & Jiang, D. (2021). Reinforced multi-teacher selection for knowledge distillation. In AAAI.","DOI":"10.1609\/aaai.v35i16.17680"},{"key":"1453_CR318","unstructured":"Yuan, L., Tay, F. E., Li, G., Wang, T. & Feng, J. (2020). Revisit knowledge distillation: a teacher-free framework. In CVPR."},{"issue":"8","key":"1453_CR319","first-page":"1955","volume":"22","author":"M Yuan","year":"2020","unstructured":"Yuan, M., & Peng, Y. (2020). CKD: Cross-task knowledge distillation for text-to-image synthesis. IEEE TMM, 22(8), 1955\u20131968.","journal-title":"IEEE TMM"},{"key":"1453_CR320","doi-asserted-by":"crossref","unstructured":"Yue, K., Deng, J., & Zhou, F. (2020). Matching guided distillation. In ECCV.","DOI":"10.1007\/978-3-030-58555-6_19"},{"key":"1453_CR321","doi-asserted-by":"crossref","unstructured":"Yun, S., Park, J., Lee, K. & Shin, J. (2020). Regularizing class-wise predictions via self-knowledge distillation. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01389"},{"key":"1453_CR322","unstructured":"Zagoruyko, S. & Komodakis, N. (2017). Paying more attention to attention: Improving the performance of convolutional neural networks via attention transfer. In ICLR."},{"key":"1453_CR323","doi-asserted-by":"crossref","unstructured":"Zhai, M., Chen, L., Tung, F., He, J., Nawhal, M. & Mori, G. (2019). Lifelong gan: Continual learning for conditional image generation. In ICCV.","DOI":"10.1109\/ICCV.2019.00285"},{"key":"1453_CR324","unstructured":"Zhai, S., Cheng, Y., Zhang, Z. M. & Lu, W. (2016). Doubly convolutional neural networks. In NeurIPS."},{"key":"1453_CR325","unstructured":"Zhao, C., & Hospedales, T. (2020). Robust domain randomised reinforcement learning through peer-to-peer distillation. In NeurIPS."},{"key":"1453_CR326","doi-asserted-by":"publisher","unstructured":"Zhao, H., Sun, X., Dong, J., Chen, C., & Dong, Z. (2020a). Highlight every step: Knowledge distillation via collaborative teaching. IEEE TCYB. https:\/\/doi.org\/10.1109\/TCYB.2020.3007506.","DOI":"10.1109\/TCYB.2020.3007506"},{"key":"1453_CR327","doi-asserted-by":"crossref","unstructured":"Zhao, L., Peng, X., Chen, Y., Kapadia, M., & Metaxas, D. N. (2020b). Knowledge as Priors: Cross-Modal Knowledge Generalization for Datasets without Superior Knowledge. In CVPR.","DOI":"10.1109\/CVPR42600.2020.00656"},{"key":"1453_CR328","doi-asserted-by":"crossref","unstructured":"Zhao, M., Li, T., Abu\u00a0Alsheikh, M., Tian, Y., Zhao, H., Torralba, A. & Katabi, D. (2018). Through-wall human pose estimation using radio signals. In CVPR.","DOI":"10.1109\/CVPR.2018.00768"},{"key":"1453_CR329","doi-asserted-by":"crossref","unstructured":"Zhang, C. & Peng, Y. (2018). Better and faster: knowledge transfer from multiple self-supervised learning tasks via graph distillation for video classification. In IJCAI.","DOI":"10.24963\/ijcai.2018\/158"},{"key":"1453_CR330","doi-asserted-by":"crossref","unstructured":"Zhang, F., Zhu, X. & Ye, M. (2019a). Fast human pose estimation. In CVPR.","DOI":"10.1109\/CVPR.2019.00363"},{"key":"1453_CR331","unstructured":"Zhang, J., Liu, T., & Tao, D. (2018). An information-theoretic view for deep learning. arXiv preprint arXiv:1804.09060."},{"key":"1453_CR332","doi-asserted-by":"publisher","first-page":"107659","DOI":"10.1016\/j.patcog.2020.107659","volume":"111","author":"H Zhang","year":"2021","unstructured":"Zhang, H., Hu, Z., Qin, W., Xu, M., & Wang, M. (2021a). Adversarial co-distillation learning for image recognition. Pattern Recognition, 111, 107659.","journal-title":"Pattern Recognition"},{"key":"1453_CR333","unstructured":"Zhang, L., Shi, Y., Shi, Z., Ma, K., & Bao, C. (2020a). Task-oriented feature distillation. In NeurIPS."},{"key":"1453_CR334","doi-asserted-by":"crossref","unstructured":"Zhang, L., Song, J., Gao, A., Chen, J., Bao, C. & Ma, K. (2019b). Be your own teacher: Improve the performance of convolutional neural networks via self distillation. In ICCV.","DOI":"10.1109\/ICCV.2019.00381"},{"key":"1453_CR335","doi-asserted-by":"crossref","unstructured":"Zhang, M., Song, G., Zhou, H., & Liu, Y. (2020b). Discriminability distillation in group representation learning. In ECCV.","DOI":"10.1007\/978-3-030-58607-2_1"},{"key":"1453_CR336","doi-asserted-by":"crossref","unstructured":"Zhang, S., Feng, Y., & Li, L. (2021b). Future-guided incremental transformer for simultaneous translation. In AAAI.","DOI":"10.1609\/aaai.v35i16.17696"},{"key":"1453_CR337","doi-asserted-by":"crossref","unstructured":"Zhang, S., Guo, S., Wang, L., Huang, W., & Scott, M. R. (2020c). Knowledge integration networks for action recognition. In AAAI.","DOI":"10.1609\/aaai.v34i07.6983"},{"key":"1453_CR338","doi-asserted-by":"crossref","unstructured":"Zhang, W., Miao, X., Shao, Y., Jiang, J., Chen, L., Ruas, O., & Cui, B. (2020d). Reliable data distillation on graph convolutional network. In ACM SIGMOD.","DOI":"10.1145\/3318464.3389706"},{"key":"1453_CR339","doi-asserted-by":"crossref","unstructured":"Zhang, X., Wang, X., Bian, J. W., Shen, C., & You, M. (2021c). Diverse knowledge distillation for end-to-end person search. In AAAI.","DOI":"10.1609\/aaai.v35i4.16454"},{"key":"1453_CR340","doi-asserted-by":"crossref","unstructured":"Zhang, X., Zhou, X., Lin, M. & Sun, J. (2018a). Shufflenet: An extremely efficient convolutional neural network for mobile devices. In CVPR.","DOI":"10.1109\/CVPR.2018.00716"},{"key":"1453_CR341","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Lan, Z., Dai, Y., Zeng, F., Bai, Y., Chang, J., & Wei, Y. (2020e). Prime-aware adaptive distillation. In ECCV.","DOI":"10.1007\/978-3-030-58529-7_39"},{"key":"1453_CR342","doi-asserted-by":"crossref","unstructured":"Zhang, Y., Xiang, T., Hospedales, T. M. & Lu, H. (2018b). Deep mutual learning. In CVPR.","DOI":"10.1109\/CVPR.2018.00454"},{"key":"1453_CR343","unstructured":"Zhang, Z., & Sabuncu, M. R. (2020). Self-distillation as instance-specific label smoothing. In NeurIPS."},{"key":"1453_CR344","doi-asserted-by":"crossref","unstructured":"Zhang, Z., Shi, Y., Yuan, C., Li, B., Wang, P., Hu, W., & Zha, Z. J. (2020f). Object relational graph with teacher-recommended learning for video captioning. In CVPR.","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"1453_CR345","unstructured":"Zhou C, Neubig G, Gu J (2019a) Understanding knowledge distillation in non-autoregressive machine translation. In ICLR."},{"key":"1453_CR346","doi-asserted-by":"crossref","unstructured":"Zhou, G., Fan, Y., Cui, R., Bian, W., Zhu, X. & Gai, K. (2018). Rocket launching: A universal and efficient framework for training well-performing light net. In AAAI.","DOI":"10.1609\/aaai.v32i1.11601"},{"key":"1453_CR347","unstructured":"Zhou, J., Zeng, S. & Zhang, B. (2019b) Two-stage image classification supervised by a single teacher single student model. In BMVC."},{"key":"1453_CR348","unstructured":"Zhou, P., Mai, L., Zhang, J., Xu, N., Wu, Z. & Davis, L. S. (2020). M2KD: Multi-model and multi-level knowledge distillation for incremental learning. In BMVC."},{"key":"1453_CR349","doi-asserted-by":"crossref","unstructured":"Zhu, M., Han, K., Zhang, C., Lin, J., & Wang, Y. (2019). Low-resolution visual recognition via deep feature distillation. In ICASSP.","DOI":"10.1109\/ICASSP.2019.8682926"},{"key":"1453_CR350","unstructured":"Zhu, X., & Gong, S. (2018). Knowledge distillation by on-the-fly native ensemble. In NeurIPS."}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-021-01453-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11263-021-01453-z\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-021-01453-z.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,8,26]],"date-time":"2024-08-26T15:26:32Z","timestamp":1724685992000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11263-021-01453-z"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,3,22]]},"references-count":350,"journal-issue":{"issue":"6","published-print":{"date-parts":[[2021,6]]}},"alternative-id":["1453"],"URL":"https:\/\/doi.org\/10.1007\/s11263-021-01453-z","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021,3,22]]},"assertion":[{"value":"29 June 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 March 2021","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 March 2021","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}