{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,11,10]],"date-time":"2025-11-10T01:29:52Z","timestamp":1762738192250,"version":"3.37.3"},"reference-count":63,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2019,6,10]],"date-time":"2019-06-10T00:00:00Z","timestamp":1560124800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,6,10]],"date-time":"2019-06-10T00:00:00Z","timestamp":1560124800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["No. 61571269"],"award-info":[{"award-number":["No. 61571269"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Cogn Comput"],"published-print":{"date-parts":[[2021,7]]},"DOI":"10.1007\/s12559-019-09656-w","type":"journal-article","created":{"date-parts":[[2019,6,10]],"date-time":"2019-06-10T08:02:54Z","timestamp":1560153774000},"page":"807-820","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":18,"title":["Image Captioning with Memorized 
Knowledge"],"prefix":"10.1007","volume":"13","author":[{"given":"Hui","family":"Chen","sequence":"first","affiliation":[]},{"given":"Guiguang","family":"Ding","sequence":"additional","affiliation":[]},{"given":"Zijia","family":"Lin","sequence":"additional","affiliation":[]},{"given":"Yuchen","family":"Guo","sequence":"additional","affiliation":[]},{"given":"Caifeng","family":"Shan","sequence":"additional","affiliation":[]},{"given":"Jungong","family":"Han","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,6,10]]},"reference":[{"key":"9656_CR1","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L. 2017. Bottom-up and top-down attention for image captioning and vqa. arXiv:1707.07998.","DOI":"10.1109\/CVPR.2018.00636"},{"key":"9656_CR2","unstructured":"Banerjee S, Lavie A. Meteor: an automatic metric for mt evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization. 2005. vol. 29, p. 65\u201372."},{"key":"9656_CR3","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Lin Z, Guo Y, Han J. Attend to knowledge: memory-enhanced attention network for image captioning. International Conference on Brain Inspired Cognitive Systems. Springer; 2018. p. 161\u201371.","DOI":"10.1007\/978-3-030-00563-4_16"},{"key":"9656_CR4","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Lin Z, Zhao S, Han J. Show, observe and tell: attribute-driven attention model for image captioning. Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence, IJCAI-18. International Joint Conferences on Artificial Intelligence Organization; 2018. p. 606\u201312.","DOI":"10.24963\/ijcai.2018\/84"},{"key":"9656_CR5","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Zhao S, Han J. 2018. 
Temporal-difference learning with sampling baseline for image captioning. AAAI Conference on Artificial Intelligence.","DOI":"10.1609\/aaai.v32i1.12263"},{"key":"9656_CR6","doi-asserted-by":"crossref","unstructured":"Chen L, Zhang H, Xiao J, Nie L, Shao J, Chua TS. 2017. Sca-cnn: spatial and channel-wise attention in convolutional networks for image captioning CVPR.","DOI":"10.1109\/CVPR.2017.667"},{"key":"9656_CR7","doi-asserted-by":"crossref","unstructured":"Chen M, Ding G, Zhao S, Chen H, Liu Q, Han J. 2017. Reference based LSTM for image captioning AAAI.","DOI":"10.1609\/aaai.v31i1.11198"},{"key":"9656_CR8","doi-asserted-by":"crossref","unstructured":"Cho K, Van Merri\u00ebnboer B, G\u00fcl\u00e7ehre \u00c7, Bahdanau D, Bougares F, Schwenk H, Bengio Y. 2014. Learning phrase representations using RNN encoder-decoder for statistical machine translation. In Conference on Empirical Methods on Natural Language processing. 2014. p. 1724\u201334.","DOI":"10.3115\/v1\/D14-1179"},{"key":"9656_CR9","doi-asserted-by":"crossref","unstructured":"Devlin J, Cheng H, Fang H, Gupta S, Deng L, He X, Zweig G, Mitchell M. 2015. Language models for image captioning: the quirks and what works. In Annual Meeting of the Association for Computational Linguistics. 2015. p. 100\u20135.","DOI":"10.3115\/v1\/P15-2017"},{"key":"9656_CR10","unstructured":"Devlin J, Gupta S, Girshick R, Mitchell M, Zitnick CL. 2015. Exploring nearest neighbor approaches for image captioning. arXiv:1505.04467."},{"key":"9656_CR11","doi-asserted-by":"publisher","unstructured":"Ding G, Chen M, Zhao S, Chen H, Han J, Liu Q. 2018. Neural image caption generation with weighted training and reference. Cognitive Computation. https:\/\/doi.org\/10.1007\/s12559-018-9581-x.","DOI":"10.1007\/s12559-018-9581-x"},{"key":"9656_CR12","doi-asserted-by":"crossref","unstructured":"Ding G, Guo Y, Chen K, Chu C, Han J, Dai Q. 2019. Decode: deep confidence network for robust image classification. 
IEEE Transactions on Image Processing.","DOI":"10.1109\/TIP.2019.2902115"},{"issue":"11","key":"9656_CR13","first-page":"5427","volume":"25","author":"G Ding","year":"2016","unstructured":"Ding G, Guo Y, Zhou J, Gao Y. Large-scale cross-modality search via collective matrix factorization hashing. TIP 2016;25(11):5427\u201340.","journal-title":"TIP"},{"key":"9656_CR14","unstructured":"Dodds A. 2013. Rehabilitating blind and visually impaired people: a psychological approach. Springer."},{"key":"9656_CR15","unstructured":"Elliott D, Keller F. Image description using visual dependency representations. In Conference on Empirical Methods on Natural Language Processing. 2013. p. 1292\u2013302."},{"key":"9656_CR16","unstructured":"Fakoor R, Mohamed Ar, Mitchell M, Kang SB, Kohli P. 2016. Memory-augmented attention modelling for videos. arXiv:1611.02261."},{"key":"9656_CR17","doi-asserted-by":"crossref","unstructured":"Farhadi A, Hejrati M, Sadeghi MA, Young P, Rashtchian C, Hockenmaier J, Forsyth D. Every picture tells a story: generating sentences from images. In European Conference on Computer Vision. 2010. p. 15\u201329.","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"9656_CR18","doi-asserted-by":"crossref","unstructured":"Gan Z, Gan C, He X, Pu Y, Tran K, Gao J, Carin L, Deng L. 2017. Semantic compositional networks for visual captioning. In CVPR.","DOI":"10.1109\/CVPR.2017.127"},{"key":"9656_CR19","doi-asserted-by":"crossref","unstructured":"Gong Y, Wang L, Hodosh M, Hockenmaier J, Lazebnik S. Improving image-sentence embeddings using large weakly annotated photo collections. In European Conference on Computer Vision. 2014. p. 529\u201345.","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"9656_CR20","doi-asserted-by":"crossref","unstructured":"Gu J, Cai J, Wang G, Chen T. 2018. Stack-captioning: coarse-to-fine learning for image captioning. 
In AAAI.","DOI":"10.1609\/aaai.v32i1.12266"},{"key":"9656_CR21","doi-asserted-by":"publisher","first-page":"770","DOI":"10.1109\/CVPR.2016.90","volume":"00","author":"K He","year":"2016","unstructured":"He K, Zhang X, Ren S, Sun J. Deep residual learning for image recognition 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR) 2016;00:770\u2013778.","journal-title":"Deep residual learning for image recognition 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"},{"issue":"8","key":"9656_CR22","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J. Long short-term memory. Neural Comput 1997;9(8):1735\u20131780.","journal-title":"Neural Comput"},{"key":"9656_CR23","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J. Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res 2013;47:853\u201399.","journal-title":"J Artif Intell Res"},{"key":"9656_CR24","doi-asserted-by":"crossref","unstructured":"Jia X, Gavves E, Fernando B, Tuytelaars T. 2015. Guiding the long-short term memory model for image caption generation. In IEEE International Conference on Computer Vision. 2015. p. 2407\u201315.","DOI":"10.1109\/ICCV.2015.277"},{"key":"9656_CR25","unstructured":"Jin J, Fu K, Cui R, Sha F, Zhang C. 2015. Aligning where to see and what to tell: image caption with region-based attention and scene factorization. arXiv:1506.06272."},{"key":"9656_CR26","unstructured":"Kaiser L, Nachum O, Roy A, Bengio S. 2017. Learning to remember rare events CVPR."},{"key":"9656_CR27","doi-asserted-by":"crossref","unstructured":"Karpathy A, Li FF. Deep visual-semantic alignments for generating image descriptions. 
In IEEE Conference on Computer Vision and Pattern Recognition. 2015. p. 3128\u201337.","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"9656_CR28","unstructured":"Krizhevsky A, Sutskever I, Hinton GE. Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems. 2012. p. 1097\u2013105."},{"key":"9656_CR29","doi-asserted-by":"crossref","unstructured":"Kulkarni G, Premraj V, Dhar S, Li S, Choi Y, Berg A, Berg T. Baby talk: understanding and generating simple image descriptions. In IEEE Conference on Computer Vision and Pattern Recognition. 2011. p. 1601\u20138.","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"9656_CR30","unstructured":"Kuznetsova P, Ordonez V, Berg A, Berg T, Choi Y. Collective generation of natural image descriptions. In Annual Meeting of the Association for Computational Linguistics. 2012. p. 359\u201368."},{"issue":"10","key":"9656_CR31","first-page":"351","volume":"2","author":"P Kuznetsova","year":"2014","unstructured":"Kuznetsova P, Ordonez V, Berg T, Choi Y. Treetalk: composition and compression of trees for image descriptions. Trans Assoc Comput Ling 2014;2(10):351\u201362.","journal-title":"Trans Assoc Comput Ling"},{"issue":"12","key":"9656_CR32","doi-asserted-by":"publisher","first-page":"5826","DOI":"10.1109\/TIP.2015.2481325","volume":"24","author":"X Lan","year":"2015","unstructured":"Lan X, Ma A, Yuen PC, Chellappa R. Joint sparse representation and robust feature-level fusion for multi-cue visual tracking. IEEE Trans Image Process 2015;24(12):5826.","journal-title":"IEEE Trans Image Process"},{"key":"9656_CR33","doi-asserted-by":"publisher","unstructured":"Lan X, Ye M, Shao R, Zhong B, Yuen PC, Zhou H. Learning modality-consistency feature templates: a robust rgb-infrared tracking system. IEEE Trans Ind Electron. 2019:1\u20131. 
https:\/\/doi.org\/10.1109\/TIE.2019.2898618.","DOI":"10.1109\/TIE.2019.2898618"},{"key":"9656_CR34","doi-asserted-by":"publisher","unstructured":"Lan X, Ye M, Zhang S, Zhou H, Yuen PC. Modality-correlation-aware sparse representation for RGB-infrared object tracking. Pattern Recogn Lett. 2018. https:\/\/doi.org\/10.1016\/j.patrec.2018.10.002.","DOI":"10.1016\/j.patrec.2018.10.002"},{"issue":"4","key":"9656_CR35","doi-asserted-by":"publisher","first-page":"2022","DOI":"10.1109\/TIP.2017.2777183","volume":"27","author":"X Lan","year":"2018","unstructured":"Lan X, Zhang S, Yuen PC, Chellappa R. Learning common and feature-specific patterns: a novel multiple-sparse-representation-based tracker. IEEE Trans Image Process 2018;27(4):2022\u201337.","journal-title":"IEEE Trans Image Process"},{"issue":"2","key":"9656_CR36","doi-asserted-by":"publisher","first-page":"368","DOI":"10.1007\/s12559-017-9533-x","volume":"10","author":"J Li","year":"2018","unstructured":"Li J, Zhang Z, He H. Hierarchical convolutional neural networks for EEG-based emotion recognition. Cogn Comput 2018;10(2):368\u201380.","journal-title":"Cogn Comput"},{"key":"9656_CR37","doi-asserted-by":"crossref","unstructured":"Li N, Chen Z. Image captioning with visual-semantic LSTM. Proceedings of the Twenty-Seventh International Joint Conference on Artificial Intelligence, IJCAI-18. International Joint Conferences on Artificial Intelligence Organization; 2018. p. 793\u2013799.","DOI":"10.24963\/ijcai.2018\/110"},{"key":"9656_CR38","doi-asserted-by":"crossref","unstructured":"Li Y, Pan Q, Yang T, Wang S, Tang J, Cambria E. Learning word representations for sentiment analysis. Cogn Comput. 2017;843\u2013851.","DOI":"10.1007\/s12559-017-9492-2"},{"key":"9656_CR39","doi-asserted-by":"crossref","unstructured":"Lin CY, Hovy E. Automatic evaluation of summaries using n-gram co-occurrence statistics. 
Proceedings of the 2003 Conference of the North American Chapter of the Association for Computational Linguistics on Human Language Technology. Association for Computational Linguistics; 2003. p. 71\u201378.","DOI":"10.3115\/1073445.1073465"},{"issue":"6","key":"9656_CR40","doi-asserted-by":"publisher","first-page":"2472","DOI":"10.1109\/TNNLS.2017.2691545","volume":"29","author":"Z Lin","year":"2018","unstructured":"Lin Z, Ding G, Han J, Shao L. End-to-end feature-aware label space encoding for multilabel classification with many classes. IEEE Trans Neural Netw Learn Syst 2018;29(6):2472\u201387.","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"9656_CR41","doi-asserted-by":"crossref","unstructured":"Lin Z, Ding G, Han J, Wang J. 2016. Cross-view retrieval via probability-based semantics-preserving hashing. IEEE Transactions on Cybernetics.","DOI":"10.1109\/TCYB.2016.2608906"},{"key":"9656_CR42","doi-asserted-by":"crossref","unstructured":"Liu S, Zhu Z, Ye N, Guadarrama S, Murphy K. Improved image captioning via policy gradient optimization of spider. In: Proceedings of the IEEE International Conference on Computer Vision. 2017. p. 873\u201381.","DOI":"10.1109\/ICCV.2017.100"},{"key":"9656_CR43","doi-asserted-by":"crossref","unstructured":"Liu X, Li H, Shao J, Chen D, Wang X. 2018. Show, tell and discriminate: image captioning by self-retrieval with partially labeled data. arXiv:1803.08314.","DOI":"10.1007\/978-3-030-01267-0_21"},{"issue":"2","key":"9656_CR44","doi-asserted-by":"publisher","first-page":"285","DOI":"10.1007\/s12559-017-9452-x","volume":"9","author":"Y Liu","year":"2017","unstructured":"Liu Y, Vong C, Wong P. Extreme learning machine for huge hypotheses re-ranking in statistical machine translation. Cogn Comput 2017;9(2):285\u201394.","journal-title":"Cogn Comput"},{"key":"9656_CR45","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R. 2017. 
Knowing when to look: adaptive attention via a visual sentinel for image captioning.","DOI":"10.1109\/CVPR.2017.345"},{"key":"9656_CR46","doi-asserted-by":"crossref","unstructured":"Luo R, Price B, Cohen S, Shakhnarovich G. 2018. Discriminability objective for training descriptive captions. arXiv:1803.04376.","DOI":"10.1109\/CVPR.2018.00728"},{"key":"9656_CR47","unstructured":"Mao J, Xu W, Yang Y, Wang J, Yuille AL. 2015. Deep captioning with multimodal recurrent neural networks (m-RNN). In International Conference on Learning Representations."},{"key":"9656_CR48","unstructured":"Mitchell M, Han X, Dodge J, Mensch A, Goyal A, Berg A, Yamaguchi K, Berg T, Stratos K, Daum\u00e9 H III. Midge: generating image descriptions from computer vision detections. In Conference of the European Chapter of the Association for Computational Linguistics. 2012. p. 747\u201356."},{"key":"9656_CR49","doi-asserted-by":"crossref","unstructured":"Papineni K, Roukos S, Ward T, Zhu WJ. Bleu: a method for automatic evaluation of machine translation. Proceedings of the 40th Annual Meeting on Association for Computational linguistics. Association for Computational Linguistics; 2002. p. 311\u20138.","DOI":"10.3115\/1073083.1073135"},{"key":"9656_CR50","unstructured":"Ranzato M, Chopra S, Auli M, Zaremba W. 2015. Sequence level training with recurrent neural networks. arXiv:1511.06732."},{"key":"9656_CR51","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V. 2016. Self-critical sequence training for image captioning CVPR.","DOI":"10.1109\/CVPR.2017.131"},{"key":"9656_CR52","unstructured":"Roopnarine J, Johnson JE. 2013. Approaches to early childhood education. Merrill\/Prentice Hall."},{"key":"9656_CR53","doi-asserted-by":"crossref","unstructured":"Vedantam R, Lawrence Zitnick C, Parikh D. Cider: consensus-based image description evaluation. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2015. p. 
4566\u201375.","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"9656_CR54","doi-asserted-by":"crossref","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D. Show and tell: a neural image caption generator. InCVPR. 2015 p. 3156\u201364.","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"9656_CR55","doi-asserted-by":"crossref","unstructured":"Wang M, Lu Z, Li H, Liu Q. Memory-enhanced decoder for neural machine translation. In Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing. 2016. p. 278\u201386.","DOI":"10.18653\/v1\/D16-1027"},{"key":"9656_CR56","unstructured":"Weston J, Chopra S, Bordes A. 2014. Memory networks. arXiv:1410.3916."},{"issue":"4","key":"9656_CR57","doi-asserted-by":"publisher","first-page":"1993","DOI":"10.1109\/TIP.2018.2882155","volume":"28","author":"G Wu","year":"2019","unstructured":"Wu G, Han J, Guo Y, Liu L, Ding G, Ni Q, Shao L. Unsupervised deep video hashing via balanced code for large-scale video retrieval. IEEE Trans Image Process 2019;28(4):1993\u20132007.","journal-title":"IEEE Trans Image Process"},{"key":"9656_CR58","doi-asserted-by":"crossref","unstructured":"Wu G, Han J, Lin Z, Ding G, Zhang B, Ni Q. 2018. Joint image-text hashing for fast large-scale cross-media retrieval using self-supervised deep learning. IEEE Transactions on Industrial Electronics.","DOI":"10.1109\/TIE.2018.2873547"},{"key":"9656_CR59","unstructured":"Xu K, Ba J, Kiros R, Cho K, Courville A, Salakhudinov R, Zemel R, Bengio Y. Show, attend and tell: neural image caption generation with visual attention. In ICML. 2015. p. 2048\u201357."},{"key":"9656_CR60","unstructured":"Yang Z, Yuan Y, Wu Y, Salakhutdinov R, Cohen WW. 2016. Encode, review, and decode: reviewer module for caption generation NIPS."},{"key":"9656_CR61","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Qiu Z, Mei T. 2016. Boosting image captioning with attributes. 
arXiv:1611.01646.","DOI":"10.1109\/ICCV.2017.524"},{"key":"9656_CR62","doi-asserted-by":"crossref","unstructured":"You Q, Jin H, Wang Z, Fang C, Luo J. 2016. Image captioning with semantic attention. In IEEE Conference on Computer Vision and Pattern Recognition. 2016. p. 4651\u201359.","DOI":"10.1109\/CVPR.2016.503"},{"issue":"1","key":"9656_CR63","doi-asserted-by":"publisher","first-page":"179","DOI":"10.1007\/s12559-017-9515-z","volume":"10","author":"G Zhong","year":"2018","unstructured":"Zhong G, Yan S, Huang K, Cai Y, Dong J. Reducing and stretching deep convolutional activation features for accurate image classification. Cogn Comput 2018;10(1):179\u201386.","journal-title":"Cogn Comput"}],"container-title":["Cognitive Computation"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12559-019-09656-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s12559-019-09656-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s12559-019-09656-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,20]],"date-time":"2022-09-20T03:07:18Z","timestamp":1663643238000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s12559-019-09656-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,6,10]]},"references-count":63,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2021,7]]}},"alternative-id":["9656"],"URL":"https:\/\/doi.org\/10.1007\/s12559-019-09656-w","relation":{},"ISSN":["1866-9956","1866-9964"],"issn-type":[{"type":"print","value":"1866-9956"},{"type":"electronic","value":"1866-9964"}],"subject":[],"published":{"date-parts":[[2019,6,10]]},"assertion":[{"va
lue":"12 November 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 May 2019","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"10 June 2019","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with Ethical Standards"}},{"value":"This article does not contain any studies with human participants or animals performed by any of the authors.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Ethical Approval"}}]}}