{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,9,8]],"date-time":"2025-09-08T05:47:37Z","timestamp":1757310457803,"version":"3.40.3"},"publisher-location":"Cham","reference-count":53,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030930455"},{"type":"electronic","value":"9783030930462"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-93046-2_52","type":"book-chapter","created":{"date-parts":[[2022,1,1]],"date-time":"2022-01-01T05:30:01Z","timestamp":1641015001000},"page":"613-625","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Revisiting Knowledge Distillation for Image Captioning"],"prefix":"10.1007","author":[{"given":"Jingjing","family":"Dong","sequence":"first","affiliation":[]},{"given":"Zhenzhen","family":"Hu","sequence":"additional","affiliation":[]},{"given":"Yuanen","family":"Zhou","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2022,1,1]]},"reference":[{"key":"52_CR1","unstructured":"Vaswani, A., et al.: Attention is all you need. In arXiv (2017)"},{"key":"52_CR2","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., et al.: Microsoft coco: common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"52_CR3","doi-asserted-by":"crossref","unstructured":"Papineni, K., et al.: Bleu: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"52_CR4","doi-asserted-by":"crossref","unstructured":"Vinyals, O., et al.: Show and tell: a neural image caption generator. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"52_CR5","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"},{"key":"52_CR6","unstructured":"Hinton, G.E., et al.: Distilling the knowledge in a neural network. In arXiv (2015)"},{"key":"52_CR7","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"52_CR8","unstructured":"Lin, C.-Y.: Rouge: a package for automatic evaluation of summaries. In: Text Summarization Branches Out (2004)"},{"key":"52_CR9","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence, Z.C., Parikh, D. Cider: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"52_CR10","doi-asserted-by":"crossref","unstructured":"Zhang, Y., et al.: Deep mutual learning. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00454"},{"key":"52_CR11","doi-asserted-by":"crossref","unstructured":"Kim, Y., Rush, A.M.: Sequence-level knowledge distillation. 
In arXiv (2016)","DOI":"10.18653\/v1\/D16-1139"},{"key":"52_CR12","doi-asserted-by":"crossref","unstructured":"Zhou, L., et al.: Unified vision-language pre-training for image captioning and vqa. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"52_CR13","doi-asserted-by":"crossref","unstructured":"Zhang, L., et al.: Be your own teacher: improve the performance of convolutional neural networks via self distillation. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00381"},{"key":"52_CR14","doi-asserted-by":"crossref","unstructured":"Zhou, Y., et al.: More grounded image captioning by distilling image-text matching model. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00483"},{"key":"52_CR15","doi-asserted-by":"crossref","unstructured":"Pan, Y., et al.: X-linear attention networks for image captioning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"52_CR16","doi-asserted-by":"crossref","unstructured":"Denkowski, M., Alon, L.: Meteor universal: language specific translation evaluation for any target language. In: Proceedings of the Ninth Workshop on Statistical Machine Translation, pp. 376\u2013380 (2014)","DOI":"10.3115\/v1\/W14-3348"},{"key":"52_CR17","doi-asserted-by":"crossref","unstructured":"Hahn, S., Choi, H.: Self-knowledge distillation in natural language processing. In arXiv (2019)","DOI":"10.26615\/978-954-452-056-4_050"},{"key":"52_CR18","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., et al.: Self-critical sequence training for image captioning. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"52_CR19","doi-asserted-by":"crossref","unstructured":"Chen, Y.-C., et al.: Distilling knowledge learned in BERT for text generation. In arXiv (2019)","DOI":"10.18653\/v1\/2020.acl-main.705"},{"key":"52_CR20","unstructured":"Dognin, P.L., et al.: Alleviating noisy data in image captioning with cooperative distillation. In: arXiv (2020)"},{"key":"52_CR21","doi-asserted-by":"crossref","unstructured":"Guo, L., et al.: Non-autoregressive image captioning with counterfactuals-critical multi-agent learning. In arXiv (2020)","DOI":"10.24963\/ijcai.2020\/107"},{"key":"52_CR22","doi-asserted-by":"crossref","unstructured":"Zhang, Z., et al.: Object relational graph with teacher-recommended learning for video captioning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01329"},{"key":"52_CR23","doi-asserted-by":"crossref","unstructured":"Pan, B., et al.: Spatio-temporal graph for video captioning with knowledge distillation. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01088"},{"key":"52_CR24","doi-asserted-by":"crossref","unstructured":"Plummer, B.A., et al.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"52_CR25","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: SPICE: Semantic Propositional Image Caption Evaluation. In: ECCV (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"52_CR26","doi-asserted-by":"crossref","unstructured":"Yuan, L., et al.: Revisiting knowledge distillation via label smoothing regularization. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.00396"},{"key":"52_CR27","doi-asserted-by":"crossref","unstructured":"Li, J., et al.: Learning to learn from noisy labeled data. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00519"},{"key":"52_CR28","doi-asserted-by":"crossref","unstructured":"He, Y.-Y., Jianxin, W., Wei, X.-S.: Distilling virtual examples for long-tailed recognition. 
In arXiv (2021)","DOI":"10.1109\/ICCV48922.2021.00030"},{"key":"52_CR29","unstructured":"Furlanello, T., et al.: Born again neural networks. In: ICML (2018)"},{"key":"52_CR30","unstructured":"Devlin, J., et al.: Bert: pre-training of deep bidirectional transformers for language understanding. In arXiv (2018)"},{"key":"52_CR31","unstructured":"Dhar, G.K.V.P.S., et al.: Baby Talk: Understanding and Generating Simple Image Descriptions (2013)"},{"key":"52_CR32","unstructured":"Mitchell, M., et al.: Midge: generating image descriptions from computer vision detections. In: ECACL (2012)"},{"key":"52_CR33","doi-asserted-by":"crossref","unstructured":"Huang, L., et al.: Attention on attention for image captioning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"52_CR34","doi-asserted-by":"crossref","unstructured":"Cornia, M., et al.: Meshed-memory transformer for image captioning. In: CVPR (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"52_CR35","doi-asserted-by":"crossref","unstructured":"Chen, D., Mei, J.P., Wang, C., Feng, Y., Chen, C.: Online knowledge distillation with diverse peers. In: AAAI (2020)","DOI":"10.1609\/aaai.v34i04.5746"},{"key":"52_CR36","unstructured":"Yuan, L., Tay, F.E., Li, G., Wang, T., Feng, J.: Revisit knowledge distillation: a teacher-free framework. In: CVPR (2020)"},{"key":"52_CR37","unstructured":"Huang, Z., Wang, N.: Like what you like: Knowledge distill via neuron selectivity transfer. In arXiv (2017)"},{"key":"52_CR38","doi-asserted-by":"crossref","unstructured":"Wei, H.R., Huang, S., Wang, R., Dai, X., Chen, J.: Online distilling from checkpoints for neural machine translation. In: NAACL-HLT (2019)","DOI":"10.18653\/v1\/N19-1192"},{"key":"52_CR39","doi-asserted-by":"crossref","unstructured":"Freitag, M., Al-Onaizan, Y., Sankaran, B.: Ensemble distillation for neural machine translation. In arXiv (2017)","DOI":"10.18653\/v1\/W17-3207"},{"key":"52_CR40","unstructured":"Su, W., et al.: VL-BERT: pre-training of generic visual-linguistic representations. In arXiv (2019)"},{"key":"52_CR41","unstructured":"Lu, J., et al.: ViLBERT: pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. In arXiv (2019)"},{"key":"52_CR42","unstructured":"Li, L.H., et al.: VisualBERT: a simple and performant baseline for vision and language. In arXiv (2019)"},{"key":"52_CR43","unstructured":"Kim, J., Park, S.U.K., Kwak, N.: Paraphrasing complex network: network compression via factor transfer. In arXiv (2018)"},{"key":"52_CR44","unstructured":"Romero, A., et al.: Fitnets: hints for thin deep nets. In arXiv (2014)"},{"key":"52_CR45","unstructured":"Zagoruyko, S., Nikos, K.: Paying more attention to attention: improving the performance of convolutional neural networks via attention transfer. In arXiv (2016)"},{"key":"52_CR46","doi-asserted-by":"crossref","unstructured":"Park, W., et al.: Relational knowledge distillation. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00409"},{"key":"52_CR47","unstructured":"Chen, H., et al.: Learning student networks via feature embedding. IEEE TNNLS (2020)"},{"key":"52_CR48","unstructured":"Xie, J., et al.: Training convolutional neural networks with cheap convolutions and online distillation. In arXiv (2019)"},{"key":"52_CR49","unstructured":"Bagherinezhad, H., et al.: Label refinery: improving imagenet classification through label progression. 
In arXiv (2018)"},{"key":"52_CR50","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Li, F.-F.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"52_CR51","unstructured":"Kingma, D.P., Jimmy, B.: Adam: a method for stochastic optimization. In arXiv (2014)"},{"key":"52_CR52","doi-asserted-by":"crossref","unstructured":"Sharma, P., et al.: Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: ACL (2018)","DOI":"10.18653\/v1\/P18-1238"},{"key":"52_CR53","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Rethinking the inception architecture for computer vision. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.308"}],"container-title":["Lecture Notes in Computer Science","Artificial Intelligence"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-93046-2_52","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,6,18]],"date-time":"2022-06-18T08:09:25Z","timestamp":1655539765000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-93046-2_52"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030930455","9783030930462"],"references-count":53,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-93046-2_52","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"1 January 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CICAI","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CAAI International Conference on Artificial Intelligence","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hangzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"5 June 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 June 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cicai2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/cicai.caai.cn\/#\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the 
{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"307","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"105","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"34% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5.3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}
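
Note: the acceptance_rate_of_full_papers assertion above states that the 34% figure is computed as "Number of Full Papers Accepted / Number of Submissions Sent for Review * 100" and rounded to a whole number. Below is a minimal Python sketch that reproduces that computation from the assertion fields of this record; the local filename record.json is a hypothetical assumption (the same envelope is what the public Crossref REST API returns for this DOI), while all field names come from the record itself.

import json

# Minimal sketch: recompute the acceptance-rate assertion embedded in the
# Crossref record above. "record.json" is a hypothetical local copy of the
# record; the public Crossref REST API returns the same envelope for
# https://api.crossref.org/works/10.1007/978-3-030-93046-2_52
with open("record.json", encoding="utf-8") as f:
    work = json.load(f)["message"]

# Index the peer-review assertions by their "name" field.
assertions = {a["name"]: a["value"] for a in work.get("assertion", [])}

accepted = int(assertions["number_of_full_papers_accepted"])          # "105"
reviewed = int(assertions["number_of_submissions_sent_for_review"])   # "307"

# Per the record: Accepted / Sent for Review * 100, rounded to a whole number.
rate = round(accepted / reviewed * 100)  # 105 / 307 * 100 = 34.2... -> 34
print(f"acceptance rate: {rate}%")       # matches the stated "34%"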