{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,29]],"date-time":"2026-04-29T22:43:55Z","timestamp":1777502635891,"version":"3.51.4"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer Nature Switzerland","isbn-type":[{"value":"9783031533075","type":"print"},{"value":"9783031533082","type":"electronic"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-53308-2_23","type":"book-chapter","created":{"date-parts":[[2024,1,27]],"date-time":"2024-01-27T21:37:36Z","timestamp":1706391456000},"page":"310-324","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Lightweight Image Captioning Model Based on\u00a0Knowledge Distillation"],"prefix":"10.1007","author":[{"given":"Zhenlei","family":"Cui","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Zhenhua","family":"Tang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianze","family":"Li","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kai","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2024,1,28]]},"reference":[{"key":"23_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"382","DOI":"10.1007\/978-3-319-46454-1_24","volume-title":"Computer Vision \u2013 ECCV 2016","author":"P Anderson","year":"2016","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9909, pp. 382\u2013398. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46454-1_24"},{"key":"23_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"issue":"3","key":"23_CR3","doi-asserted-by":"publisher","first-page":"1638","DOI":"10.3390\/app12031638","volume":"12","author":"V Atliha","year":"2022","unstructured":"Atliha, V., \u0160e\u0161ok, D.: Image-captioning model compression. Appl. Sci. 12(3), 1638 (2022)","journal-title":"Appl. Sci."},{"key":"23_CR4","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: Proceedings of the ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"23_CR5","doi-asserted-by":"crossref","unstructured":"Barraco, M., Stefanini, M., Cornia, M., Cascianelli, S., Baraldi, L., Cucchiara, R.: CaMEL: mean teacher learning for image captioning. In: 2022 26th International Conference on Pattern Recognition (ICPR), pp. 4087\u20134094. IEEE (2022)","DOI":"10.1109\/ICPR56361.2022.9955644"},{"key":"23_CR6","doi-asserted-by":"crossref","unstructured":"Cornia, M., Baraldi, L., Cucchiara, R.: SMArT: training shallow memory-aware transformers for robotic explainability. In: 2020 IEEE International Conference on Robotics and Automation (ICRA), pp. 1128\u20131134. IEEE (2020)","DOI":"10.1109\/ICRA40945.2020.9196653"},{"key":"23_CR7","doi-asserted-by":"crossref","unstructured":"Cornia, M., Stefanini, M., Baraldi, L., Cucchiara, R.: Meshed-memory transformer for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10578\u201310587 (2020)","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"23_CR8","unstructured":"Denil, M., Shakibi, B., Dinh, L., Ranzato, M., De Freitas, N.: Predicting parameters in deep learning. In: Advances in Neural Information Processing Systems, vol. 26 (2013)"},{"key":"23_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"613","DOI":"10.1007\/978-3-030-93046-2_52","volume-title":"Artificial Intelligence CICAI 2021","author":"J Dong","year":"2021","unstructured":"Dong, J., Hu, Z., Zhou, Y.: Revisiting knowledge distillation for image captioning. In: Fang, L., Chen, Y., Zhai, G., Wang, J., Wang, R., Dong, W. (eds.) Artificial Intelligence CICAI 2021. Lecture Notes in Computer Science, vol. 13069, pp. 613\u2013625. Springer, Cham (2021)"},{"key":"23_CR10","doi-asserted-by":"crossref","unstructured":"Hahn, S., Choi, H.: Self-knowledge distillation in natural language processing. arXiv preprint arXiv:1908.01851 (2019)","DOI":"10.26615\/978-954-452-056-4_050"},{"key":"23_CR11","unstructured":"Herdade, S., Kappeler, A., Boakye, K., Soares, J.: Image captioning: transforming objects into words. In: Advances in Neural Information Processing Systems, vol. 32 (2019)"},{"key":"23_CR12","unstructured":"Hinton, G., Vinyals, O., Dean, J.: Distilling the knowledge in a neural network. arXiv preprint arXiv:1503.02531 (2015)"},{"key":"23_CR13","doi-asserted-by":"publisher","first-page":"12525","DOI":"10.1007\/s11042-020-10292-y","volume":"80","author":"HY Hsieh","year":"2021","unstructured":"Hsieh, H.Y., Huang, S.A., Leu, J.S.: Implementing a real-time image captioning service for scene identification using embedded system. Multimed. Tools Appl. 80, 12525\u201312537 (2021)","journal-title":"Multimed. Tools Appl."},{"key":"23_CR14","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.Y.: Attention on attention for image captioning. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp. 4634\u20134643 (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"23_CR15","unstructured":"Huang, Y., Chen, J.: Teacher-critical training strategies for image captioning. arXiv preprint arXiv:2009.14405 (2020)"},{"key":"23_CR16","doi-asserted-by":"crossref","unstructured":"Jiao, X., et al.: TinyBERT: Distilling BERT for natural language understanding. arXiv preprint arXiv:1909.10351 (2019)","DOI":"10.18653\/v1\/2020.findings-emnlp.372"},{"key":"23_CR17","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"23_CR18","doi-asserted-by":"crossref","unstructured":"Li, X., Guo, D., Liu, H., Sun, F.: Robotic indoor scene captioning from streaming video. In: 2021 IEEE International Conference on Robotics and Automation (ICRA), pp. 6109\u20136115. IEEE (2021)","DOI":"10.1109\/ICRA48506.2021.9560904"},{"key":"23_CR19","unstructured":"Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: Text summarization branches out, pp. 74\u201381 (2004)"},{"key":"23_CR20","unstructured":"Luo, R.: A better variant of self-critical sequence training. arXiv preprint arXiv:2003.09971 (2020)"},{"key":"23_CR21","doi-asserted-by":"crossref","unstructured":"Luo, Y., et al.: Dual-level collaborative transformer for image captioning. In: Proceedings of the AAAI Conference on Artificial Intelligence, vol. 35, pp. 2286\u20132293 (2021)","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"23_CR22","doi-asserted-by":"publisher","first-page":"109420","DOI":"10.1016\/j.patcog.2023.109420","volume":"138","author":"Y Ma","year":"2023","unstructured":"Ma, Y., Ji, J., Sun, X., Zhou, Y., Ji, R.: Towards local visual modeling for image captioning. Pattern Recogn. 138, 109420 (2023)","journal-title":"Pattern Recogn."},{"key":"23_CR23","doi-asserted-by":"crossref","unstructured":"Pan, Y., Yao, T., Li, Y., Mei, T.: X-linear attention networks for image captioning. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 10971\u201310980 (2020)","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"23_CR24","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"23_CR25","unstructured":"Radford, A., et al.: Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, pp. 8748\u20138763. PMLR (2021)"},{"key":"23_CR26","doi-asserted-by":"crossref","unstructured":"Sun, S., Cheng, Y., Gan, Z., Liu, J.: Patient knowledge distillation for BERT model compression. arXiv preprint arXiv:1908.09355 (2019)","DOI":"10.18653\/v1\/D19-1441"},{"issue":"10","key":"23_CR27","doi-asserted-by":"publisher","first-page":"2686","DOI":"10.1109\/TMM.2019.2904878","volume":"21","author":"JH Tan","year":"2019","unstructured":"Tan, J.H., Chan, C.S., Chuah, J.H.: COMIC: toward a compact image captioning model with attention. IEEE Trans. Multimed. 21(10), 2686\u20132696 (2019)","journal-title":"IEEE Trans. Multimed."},{"key":"23_CR28","doi-asserted-by":"publisher","first-page":"108366","DOI":"10.1016\/j.patcog.2021.108366","volume":"122","author":"JH Tan","year":"2022","unstructured":"Tan, J.H., Chan, C.S., Chuah, J.H.: End-to-end supermask pruning: learning to prune image captioning models. Pattern Recogn. 122, 108366 (2022)","journal-title":"Pattern Recogn."},{"key":"23_CR29","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1016\/j.neucom.2022.01.081","volume":"482","author":"JH Tan","year":"2022","unstructured":"Tan, J.H., Tan, Y.H., Chan, C.S., Chuah, J.H.: ACORT: a compact object relation transformer for parameter efficient image captioning. Neurocomputing 482, 60\u201372 (2022)","journal-title":"Neurocomputing"},{"key":"23_CR30","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Advances in Neural Information Processing Systems, vol. 30 (2017)"},{"key":"23_CR31","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"23_CR32","unstructured":"Wang, W., Wei, F., Dong, L., Bao, H., Yang, N., Zhou, M.: MINILM: deep self-attention distillation for task-agnostic compression of pre-trained transformers. In: Advances in Neural Information Processing Systems, vol. 33, pp. 5776\u20135788 (2020)"},{"issue":"5","key":"23_CR33","first-page":"2313","volume":"44","author":"X Yang","year":"2020","unstructured":"Yang, X., Zhang, H., Cai, J.: Auto-encoding and distilling scene graphs for image captioning. IEEE Trans. Pattern Anal. Mach. Intell. 44(5), 2313\u20132327 (2020)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"issue":"4","key":"23_CR34","first-page":"5099","volume":"45","author":"Q Zhang","year":"2022","unstructured":"Zhang, Q., Cheng, X., Chen, Y., Rao, Z.: Quantifying the knowledge in a DNN to explain knowledge distillation for classification. IEEE Trans. Pattern Anal. Mach. Intell. 45(4), 5099\u20135113 (2022)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"23_CR35","doi-asserted-by":"crossref","unstructured":"Zhang, X., et al.: RSTNet: captioning with adaptive attention on visual and non-visual words. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp. 15465\u201315474 (2021)","DOI":"10.1109\/CVPR46437.2021.01521"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-53308-2_23","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T11:10:22Z","timestamp":1710241822000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-53308-2_23"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031533075","9783031533082"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-53308-2_23","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"28 January 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 February 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ConfTool Pro","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"297","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"112","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"38% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}