{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,14]],"date-time":"2026-01-14T14:51:16Z","timestamp":1768402276698,"version":"3.49.0"},"reference-count":57,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61572108"],"award-info":[{"award-number":["61572108"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61632007"],"award-info":[{"award-number":["61632007"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100013314","name":"111 Project","doi-asserted-by":"crossref","award":["B17008"],"award-info":[{"award-number":["B17008"]}],"id":[{"id":"10.13039\/501100013314","id-type":"DOI","asserted-by":"crossref"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. 
on Image Process."],"published-print":{"date-parts":[[2019,1]]},"DOI":"10.1109\/tip.2018.2855415","type":"journal-article","created":{"date-parts":[[2018,7,12]],"date-time":"2018-07-12T19:05:00Z","timestamp":1531422300000},"page":"32-44","source":"Crossref","is-referenced-by-count":76,"title":["More is Better: Precise and Detailed Image Captioning Using Online Positive Recall and Missing Concepts Mining"],"prefix":"10.1109","volume":"28","author":[{"given":"Mingxing","family":"Zhang","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5070-4511","authenticated-orcid":false,"given":"Yang","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Hanwang","family":"Zhang","sequence":"additional","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9122-6141","authenticated-orcid":false,"given":"Yanli","family":"Ji","sequence":"additional","affiliation":[]},{"given":"Heng Tao","family":"Shen","sequence":"additional","affiliation":[]},{"given":"Tat-Seng","family":"Chua","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298965"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2017.2763618"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2612883"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2676345"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2016.2515990"},{"key":"ref37","doi-asserted-by":"crossref","first-page":"1345","DOI":"10.1145\/3123266.3123391","article-title":"Adaptively attending to visual attributes and linguistic knowledge for captioning","author":"bin","year":"2017","journal-title":"Proc ACM Multimedia Conf"},{"key":"ref36","first-page":"203","article-title":"What value do explicit high level concepts have in vision to language problems?","author":"wu","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2017.2673241"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2017.2703636"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TCYB.2014.2344015"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/TKDE.2017.2701825"},{"key":"ref2","author":"kiros","year":"2014","journal-title":"Unifying Visual-Semantic Embeddings with Multimodal Neural Language Models"},{"key":"ref1","first-page":"2625","article-title":"Long-term recurrent convolutional networks for visual recognition and description","author":"donahue","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref20","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref21","first-page":"595","article-title":"Multimodal neural language models","volume":"14","author":"kiros","year":"2014","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref24","author":"pedersoli","year":"2016","journal-title":"Areas of attention for image captioning"},{"key":"ref23","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual 
attention","author":"xu","year":"2015","journal-title":"Proceedings of the 32nd Intl Conf on Machine Learning"},{"key":"ref26","first-page":"2361","article-title":"Review networks for caption generation","author":"yang","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref25","first-page":"261","article-title":"Deep semantic indexing using convolutional localization network with region-based visual attention for image database","author":"zhang","year":"2017","journal-title":"Proc Australasian Database Conf"},{"key":"ref50","article-title":"Entropy regularization","author":"grandvalet","year":"2005","journal-title":"Semi-Supervised Learning"},{"key":"ref51","doi-asserted-by":"crossref","first-page":"675","DOI":"10.1145\/2647868.2654889","article-title":"Caffe: Convolutional architecture for fast feature embedding","author":"jia","year":"2014","journal-title":"Proc 22nd ACM Int Conf Multimedia"},{"key":"ref57","first-page":"770","article-title":"Deep residual learning for image recognition","author":"he","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref56","author":"simonyan","year":"2014","journal-title":"Very Deep Convolutional Networks for Large-scale Image Recognition"},{"key":"ref55","first-page":"7272","article-title":"Skeleton key: Image captioning by skeleton-attribute decomposition","author":"wang","year":"2017","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref54","first-page":"6298","article-title":"SCA-CNN: Spatial and channel-wise attention in convolutional networks for image captioning","author":"chen","year":"2017","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit (CVPR)"},{"key":"ref53","author":"zhou","year":"2016","journal-title":"Watch what you just said Image captioning with text-conditional attention"},{"key":"ref52","author":"kingma","year":"2014","journal-title":"Adam A method for stochastic optimization"},{"key":"ref10","first-page":"177","article-title":"A tutorial on multi-label classification techniques","volume":"5","author":"de carvalho","year":"2009","journal-title":"Foundations of Computational Intelligence"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2717185"},{"key":"ref40","first-page":"6504","article-title":"Video captioning with transferred semantic attributes","author":"pan","year":"2017","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref12","first-page":"570","article-title":"A framework for multiple-instance learning","author":"maron","year":"1998","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref13","article-title":"Learning bidirectional temporal cues for video-based person re-identification","author":"zhang","year":"0","journal-title":"IEEE Trans Circuits Syst Video Technol"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2806279"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2017.09.012"},{"key":"ref16","article-title":"Transfer hashing: From shallow to deep","author":"zhou","year":"0","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"ref17","article-title":"Describing video with attention-based bidirectional LSTM","author":"bin","year":"0","journal-title":"IEEE Trans Cybern"},{"key":"ref18","author":"bahdanau","year":"2014","journal-title":"Neural machine translation by jointly learning to align and 
translate"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref3","author":"mao","year":"2014","journal-title":"Deep captioning with multimodal recurrent neural networks (m-rnn)"},{"key":"ref6","first-page":"4651","article-title":"Image captioning with semantic attention","author":"you","year":"2016","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"ref8","first-page":"5630","article-title":"Semantic compositional networks for visual captioning","author":"gan","year":"2017","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref7","author":"yao","year":"2016","journal-title":"Boosting image captioning with attributes"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1109\/ICASSP.2013.6638947"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.4018\/jdwm.2007070101"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.323"},{"key":"ref45","first-page":"5752","article-title":"Diverse and accurate image description using a variational auto-encoder with an additive Gaussian encoding space","author":"wang","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2855422"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.445"},{"key":"ref42","first-page":"2957","article-title":"Diverse image captioning via GroupTalk","author":"wang","year":"2016","journal-title":"Proc Int Joint Conf Artif Intell"},{"key":"ref41","first-page":"3261","article-title":"End-to-end concept word detection for video captioning, retrieval, and question answering","author":"yu","year":"2017","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref44","first-page":"5753","article-title":"Captioning images with diverse objects","author":"venugopalan","year":"2017","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref43","author":"vijayakumar","year":"2016","journal-title":"Diverse beam search Decoding diverse solutions from neural sequence models"}],"container-title":["IEEE Transactions on Image Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/83\/8468142\/08410582.pdf?arnumber=8410582","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,27]],"date-time":"2022-08-27T10:49:02Z","timestamp":1661597342000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/8410582\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,1]]},"references-count":57,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tip.2018.2855415","relation":{},"ISSN":["1057-7149","1941-0042"],"issn-type":[{"value":"1057-7149","type":"print"},{"value":"1941-0042","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,1]]}}}