{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,26]],"date-time":"2026-02-26T23:05:30Z","timestamp":1772147130403,"version":"3.50.1"},"reference-count":48,"publisher":"Springer Science and Business Media LLC","issue":"12","license":[{"start":{"date-parts":[[2020,10,27]],"date-time":"2020-10-27T00:00:00Z","timestamp":1603756800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,10,27]],"date-time":"2020-10-27T00:00:00Z","timestamp":1603756800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61966004"],"award-info":[{"award-number":["61966004"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National Natural Science Foundation of China","award":["61663004"],"award-info":[{"award-number":["61663004"]}]},{"DOI":"10.13039\/501100004607","name":"Natural Science Foundation of Guangxi Province","doi-asserted-by":"publisher","award":["2019GXNSFDA245018"],"award-info":[{"award-number":["2019GXNSFDA245018"]}],"id":[{"id":"10.13039\/501100004607","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Mach Learn"],"published-print":{"date-parts":[[2020,12]]},"DOI":"10.1007\/s10994-020-05919-y","type":"journal-article","created":{"date-parts":[[2020,10,27]],"date-time":"2020-10-27T17:13:10Z","timestamp":1603818790000},"page":"2313-2332","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":33,"title":["Boost image captioning with knowledge 
reasoning"],"prefix":"10.1007","volume":"109","author":[{"given":"Feicheng","family":"Huang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5313-6134","authenticated-orcid":false,"given":"Zhixin","family":"Li","sequence":"additional","affiliation":[]},{"given":"Haiyang","family":"Wei","sequence":"additional","affiliation":[]},{"given":"Canlong","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Huifang","family":"Ma","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2020,10,27]]},"reference":[{"key":"5919_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S., & Zhang, L. (2018). Bottom-up and top-down attention for image captioning and visual question answering. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 6077\u20136086).","DOI":"10.1109\/CVPR.2018.00636"},{"key":"5919_CR2","unstructured":"Anne\u00a0H. L., Venugopalan, S., Rohrbach, M., Mooney, R., Saenko, K., & Darrell, T. (2016). Deep compositional captioning: Describing novel object categories without paired training data. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1\u201310)."},{"key":"5919_CR3","doi-asserted-by":"publisher","first-page":"291","DOI":"10.1016\/j.neucom.2018.05.080","volume":"311","author":"S Bai","year":"2018","unstructured":"Bai, S., & An, S. (2018). A survey on automatic image caption generation. Neurocomputing, 311, 291\u2013304.","journal-title":"Neurocomputing"},{"key":"5919_CR4","unstructured":"Banerjee, S., & Lavie, A. (2005). Meteor: An automatic metric for mt evaluation with improved correlation with human judgments. In Proceedings of the acl workshop on intrinsic and extrinsic evaluation measures for machine translation and\/or summarization (pp. 65\u201372)."},{"key":"5919_CR5","unstructured":"Bengio, S., Vinyals, O., Jaitly, N., & Shazeer, N. 
(2015). Scheduled sampling for sequence prediction with recurrent neural networks. In Advances in neural information processing systems (pp. 1171\u20131179)."},{"key":"5919_CR6","doi-asserted-by":"crossref","unstructured":"Chen, L., Zhang, H., Xiao, J., Nie, L., Shao, J., Liu, W., & Chua, T.S. (2017a). Sca-cnn: Spatial and channel-wise attention in convolutional networks for image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5659\u20135667).","DOI":"10.1109\/CVPR.2017.667"},{"key":"5919_CR7","doi-asserted-by":"crossref","unstructured":"Chen, M., Ding, G., Zhao, S., Chen, H., Liu, Q., & Han, J. (2017b). Reference based lstm for image captioning. In Thirty-first AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v31i1.11198"},{"key":"5919_CR8","doi-asserted-by":"crossref","unstructured":"Chen, T.H., Liao, Y.H., Chuang, C.Y., Hsu, W.T., Fu, J., & Sun, M. (2017c). Show, adapt and tell: Adversarial training of cross-domain image captioner. In Proceedings of the IEEE international conference on computer vision (pp. 521\u2013530).","DOI":"10.1109\/ICCV.2017.64"},{"key":"5919_CR9","unstructured":"Chunseong\u00a0Park, C., Kim, B., & Kim, G. (2017). Attend to you: personalized image captioning with context sequence memory networks. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 895\u2013903)."},{"issue":"2","key":"5919_CR10","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3177745","volume":"14","author":"M Cornia","year":"2018","unstructured":"Cornia, M., Baraldi, L., Serra, G., & Cucchiara, R. (2018). Paying more attention to saliency: Image captioning with saliency and context attention. 
ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM), 14(2), 1\u201321.","journal-title":"ACM Transactions on Multimedia Computing, Communications, and Applications (TOMM)"},{"key":"5919_CR11","doi-asserted-by":"crossref","unstructured":"Gu, J., Zhao, H., Lin, Z., Li, S., Cai, J., & Ling, M. (2019). Scene graph generation with external knowledge and image reconstruction (pp. 1969\u20131978).","DOI":"10.1109\/CVPR.2019.00207"},{"issue":"6","key":"5919_CR12","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1145\/3295748","volume":"51","author":"MZ Hossain","year":"2019","unstructured":"Hossain, M. Z., Sohel, F., Shiratuddin, M. F., & Laga, H. (2019). A comprehensive survey of deep learning for image captioning. ACM Computing Surveys (CSUR), 51(6), 1\u201336.","journal-title":"ACM Computing Surveys (CSUR)"},{"key":"5919_CR13","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., & Wei, X. (2019). Attention on attention for image captioning (pp. 4634\u20134643).","DOI":"10.1109\/ICCV.2019.00473"},{"key":"5919_CR14","doi-asserted-by":"crossref","unstructured":"Jia, X., Gavves, E., Fernando, B., & Tuytelaars, T. (2015). Guiding the long-short term memory model for image caption generation. In Proceedings of the IEEE international conference on computer vision (pp. 2407\u20132415).","DOI":"10.1109\/ICCV.2015.277"},{"key":"5919_CR15","doi-asserted-by":"crossref","unstructured":"Jiang, W., Ma, L., Chen, X., Zhang, H., & Liu, W. (2018a). Learning to guide decoding for image captioning. In Thirty-second AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v32i1.12283"},{"key":"5919_CR16","doi-asserted-by":"crossref","unstructured":"Jiang, W., Ma, L., Jiang, Y.G., Liu, W., & Zhang, T. (2018b). Recurrent fusion network for image captioning. In Proceedings of the European conference on computer vision (ECCV) (pp. 
499\u2013515).","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"5919_CR17","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., & Fei-Fei, L. (2016). Densecap: Fully convolutional localization networks for dense captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4565\u20134574).","DOI":"10.1109\/CVPR.2016.494"},{"key":"5919_CR18","doi-asserted-by":"crossref","unstructured":"Kim, B., Han\u00a0Lee, Y., Jung, H., & Cho, C. (2018). Distinctive-attribute extraction for image captioning. In Proceedings of the European conference on computer vision (ECCV) (pp. 0\u20130).","DOI":"10.1007\/978-3-030-11018-5_12"},{"key":"5919_CR19","unstructured":"Li, G., Su, H., & Zhu, W. (2017a). Incorporating external knowledge to answer open-domain visual questions with dynamic memory networks. arXiv: Computer Vision and Pattern Recognition."},{"key":"5919_CR20","doi-asserted-by":"crossref","unstructured":"Li, L., Tang, S., Deng, L., Zhang, Y., & Tian, Q. (2017b). Image caption with global-local attention. In Thirty-first AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v31i1.11236"},{"key":"5919_CR21","doi-asserted-by":"crossref","unstructured":"Li, X., Jiang, S., & Han, J. (2019a). Learning object context for dense captioning. In Proceedings of the AAAI conference on artificial intelligence (Vol. 33, pp. 8650\u20138657).","DOI":"10.1609\/aaai.v33i01.33018650"},{"key":"5919_CR22","doi-asserted-by":"crossref","unstructured":"Li, Y., Yao, T., Pan, Y., Chao, H., & Mei, T. (2019b). Pointing novel objects in image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 12497\u201312506).","DOI":"10.1109\/CVPR.2019.01278"},{"key":"5919_CR23","doi-asserted-by":"crossref","unstructured":"Lin, C.Y., & Hovy, E. (2003). Automatic evaluation of summaries using n-gram co-occurrence statistics. 
In Proceedings of the 2003 human language technology conference of the North American chapter of the association for computational linguistics (pp. 150\u2013157).","DOI":"10.3115\/1073445.1073465"},{"key":"5919_CR24","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, J., Sha, F., & Yuille, A. (2017a). Attention correctness in neural image captioning. In Thirty-first AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v31i1.11197"},{"key":"5919_CR25","unstructured":"Liu, S., Zhu, Z., Ye, N., Guadarrama, S., & Murphy, K. (2016). Optimization of image description metrics using policy gradient methods. arXiv preprint arXiv:1612.00370"},{"key":"5919_CR26","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhu, Z., Ye, N., Guadarrama, S., & Murphy, K. (2017b). Improved image captioning via policy gradient optimization of spider. In Proceedings of the IEEE international conference on computer vision (pp. 873\u2013881).","DOI":"10.1109\/ICCV.2017.100"},{"key":"5919_CR27","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., & Socher, R. (2017). Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 375\u2013383).","DOI":"10.1109\/CVPR.2017.345"},{"key":"5919_CR28","doi-asserted-by":"crossref","unstructured":"Mathews, A.P., Xie, L., & He, X. (2016). Senticap: Generating image descriptions with sentiments. In Thirtieth AAAI conference on artificial intelligence.","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"5919_CR29","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., & Zhu, W.J. (2002). Bleu: a method for automatic evaluation of machine translation. In Proceedings of the 40th annual meeting on association for computational linguistics (pp. 311\u2013318). 
Association for Computational Linguistics.","DOI":"10.3115\/1073083.1073135"},{"key":"5919_CR30","doi-asserted-by":"crossref","unstructured":"Pedersoli, M., Lucas, T., Schmid, C., & Verbeek, J. (2017). Areas of attention for image captioning. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1242\u20131250.","DOI":"10.1109\/ICCV.2017.140"},{"key":"5919_CR31","doi-asserted-by":"crossref","unstructured":"Qin, Y., Du, J., Zhang, Y., & Lu, H. (2019). Look back and predict forward in image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 8367\u20138375).","DOI":"10.1109\/CVPR.2019.00856"},{"key":"5919_CR32","doi-asserted-by":"crossref","unstructured":"Ramanishka, V., Das, A., Zhang, J., & Saenko, K. (2017). Top-down visual saliency guided by captions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 7206\u20137215).","DOI":"10.1109\/CVPR.2017.334"},{"key":"5919_CR33","unstructured":"Ranzato, M., Chopra, S., Auli, M., & Zaremba, W. (2015). Sequence level training with recurrent neural networks. arXiv preprint arXiv:1511.06732"},{"key":"5919_CR34","unstructured":"Ren, S., He, K., Girshick, R., & Sun, J. (2015). Faster r-cnn: Towards real-time object detection with region proposal networks. In Advances in neural information processing systems (pp. 91\u201399)."},{"key":"5919_CR35","doi-asserted-by":"crossref","unstructured":"Ren, Z., Wang, X., Zhang, N., Lv, X., & Li, L.J. (2017). Deep reinforcement learning-based image captioning with embedding reward. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 290\u2013298).","DOI":"10.1109\/CVPR.2017.128"},{"key":"5919_CR36","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., & Goel, V. (2017). Self-critical sequence training for image captioning. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 
7008\u20137024).","DOI":"10.1109\/CVPR.2017.131"},{"key":"5919_CR37","unstructured":"Speer, R., Chin, J., & Havasi, C. (2017). Conceptnet 5.5: An open multilingual graph of general knowledge. In Thirty-first AAAI conference on artificial intelligence."},{"key":"5919_CR38","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Lawrence\u00a0Zitnick, C., & Parikh, D. (2015). Cider: Consensus-based image description evaluation. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4566\u20134575).","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"5919_CR39","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., & Erhan, D. (2015). Show and tell: A neural image caption generator. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3156\u20133164).","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"5919_CR40","doi-asserted-by":"crossref","unstructured":"Wang, Y., Lin, Z., Shen, X., Cohen, S., & Cottrell, G.W. (2017). Skeleton key: Image captioning by skeleton-attribute decomposition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 7272\u20137281).","DOI":"10.1109\/CVPR.2017.780"},{"issue":"6","key":"5919_CR41","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2017.2708709","volume":"40","author":"Q Wu","year":"2017","unstructured":"Wu, Q., Shen, C., Wang, P., Dick, A., & van den Hengel, A. (2017). Image captioning and visual question answering based on attributes and external knowledge. IEEE Transactions on Pattern Analysis and Machine Intelligence, 40(6), 1367\u20131381.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"5919_CR42","unstructured":"Wu, Z., & Cohen, R. (2016). Encode, review, and decode: Reviewer module for caption generation. 
arXiv preprint arXiv:1605.07912"},{"key":"5919_CR43","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., & Bengio, Y. (2015). Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning (pp. 2048\u20132057)."},{"key":"5919_CR44","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., & Mei, T. (2017a). Incorporating copying mechanism in image captioning for learning novel objects. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 6580\u20136588).","DOI":"10.1109\/CVPR.2017.559"},{"key":"5919_CR45","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., Mei, T. (2019). Hierarchy parsing for image captioning (pp. 2621\u20132629).","DOI":"10.1109\/ICCV.2019.00271"},{"key":"5919_CR46","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., Qiu, Z., & Mei, T. (2017b). Boosting image captioning with attributes. In Proceedings of the IEEE international conference on computer vision (pp. 4894\u20134902).","DOI":"10.1109\/ICCV.2017.524"},{"key":"5919_CR47","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., & Luo, J. (2016). Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 4651\u20134659).","DOI":"10.1109\/CVPR.2016.503"},{"key":"5919_CR48","doi-asserted-by":"crossref","unstructured":"Zhou, Y., Sun, Y., & Honavar, V. (2019). Improving image captioning by leveraging knowledge graphs. In 2019 IEEE winter conference on applications of computer vision (WACV) (pp. 283\u2013293). 
IEEE.","DOI":"10.1109\/WACV.2019.00036"}],"container-title":["Machine Learning"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-020-05919-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10994-020-05919-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10994-020-05919-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,11,24]],"date-time":"2022-11-24T23:53:53Z","timestamp":1669334033000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10994-020-05919-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,10,27]]},"references-count":48,"journal-issue":{"issue":"12","published-print":{"date-parts":[[2020,12]]}},"alternative-id":["5919"],"URL":"https:\/\/doi.org\/10.1007\/s10994-020-05919-y","relation":{},"ISSN":["0885-6125","1573-0565"],"issn-type":[{"value":"0885-6125","type":"print"},{"value":"1573-0565","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020,10,27]]},"assertion":[{"value":"15 April 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"21 July 2020","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 September 2020","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"27 October 2020","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}