{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,3]],"date-time":"2026-02-03T08:58:30Z","timestamp":1770109110222,"version":"3.49.0"},"reference-count":48,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2017YFA0700904"],"award-info":[{"award-number":["2017YFA0700904"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U19B2034"],"award-info":[{"award-number":["U19B2034"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61836014"],"award-info":[{"award-number":["61836014"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62061136001"],"award-info":[{"award-number":["62061136001"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/100004358","name":"grant from Samsung Research China, Beijing","doi-asserted-by":"publisher","id":[{"id":"10.13039\/100004358","id-type":"DOI","asserted-by":"publisher"}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. on Image Process."],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/tip.2021.3051476","type":"journal-article","created":{"date-parts":[[2021,1,20]],"date-time":"2021-01-20T15:54:08Z","timestamp":1611158048000},"page":"2450-2460","source":"Crossref","is-referenced-by-count":20,"title":["Vocabulary-Wide Credit Assignment for Training Image Captioning Models"],"prefix":"10.1109","volume":"30","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-3051-1990","authenticated-orcid":false,"given":"Han","family":"Liu","sequence":"first","affiliation":[{"name":"Department of Computer Science and Technology, State Key Laboratory of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, Institute for Artificial Intelligence, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8305-9566","authenticated-orcid":false,"given":"Shifeng","family":"Zhang","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, State Key Laboratory of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, Institute for Artificial Intelligence, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ke","family":"Lin","sequence":"additional","affiliation":[{"name":"Samsung Research China, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jing","family":"Wen","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, State Key Laboratory of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, Institute for Artificial Intelligence, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4937-2433","authenticated-orcid":false,"given":"Jianmin","family":"Li","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, State Key Laboratory of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, Institute for Artificial Intelligence, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4907-7354","authenticated-orcid":false,"given":"Xiaolin","family":"Hu","sequence":"additional","affiliation":[{"name":"Department of Computer Science and Technology, State Key Laboratory of Intelligent Technology and Systems, Beijing National Research Center for Information Science and Technology, Institute for Artificial Intelligence, Tsinghua University, Beijing, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"263","reference":[{"key":"ref39","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref38","first-page":"3104","article-title":"Sequence to sequence learning with neural networks","author":"sutskever","year":"2014","journal-title":"Advances in neural information processing systems"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref31","first-page":"4601","article-title":"Professor forcing: A new algorithm for training recurrent networks","author":"lamb","year":"2016","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-4012"},{"key":"ref36","first-page":"1","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2015","journal-title":"Proc 3rd Int Conf Learn Represent"},{"key":"ref35","first-page":"1","article-title":"Deep captioning with multimodal recurrent neural networks (M-RNN)","author":"mao","year":"2015","journal-title":"Proc 3rd Int Conf Learn Represent"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref10","first-page":"1171","article-title":"Scheduled sampling for sequence prediction with recurrent neural networks","author":"bengio","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref40","article-title":"Microsoft COCO captions: Data collection and evaluation server","author":"chen","year":"2015","journal-title":"arXiv 1504 00325"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref12","first-page":"311","article-title":"Bleu: A method for automatic evaluation of machine translation","author":"papineni","year":"2002","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref13","first-page":"1","article-title":"Sequence level training with recurrent neural networks","author":"ranzato","year":"2016","journal-title":"Proc 4th Int Conf Learn Represent"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref15","first-page":"684","article-title":"Exploring visual relationship for image captioning","author":"yao","year":"2018","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01094"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"ref18","first-page":"1","article-title":"Actor-critic sequence training for image captioning","author":"zhang","year":"2017","journal-title":"Proc NIPS Workshop Visually-Grounded Interact Lang"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00646"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/TCSVT.2019.2947482"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"},{"key":"ref27","first-page":"5998","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.162"},{"key":"ref6","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref7","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00188"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref1","first-page":"1143","article-title":"Im2Text: Describing images using 1 million captioned photographs","author":"ordonez","year":"2011","journal-title":"Proc Adv Neural Inf Process Syst"},{"key":"ref46","first-page":"1","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2015","journal-title":"Proc 3rd Int Conf Learn Represent"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240632"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.524"},{"key":"ref47","first-page":"1","article-title":"Stack-captioning: Coarse-to-fine learning for image captioning","author":"gu","year":"2018","journal-title":"Proc 32nd Assoc Adv Artif Intell Conf Artif Intell"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"ref42","first-page":"10","article-title":"ROUGE: A package for automatic evaluation of summaries","author":"lin","year":"2004","journal-title":"Proc ACL Workshop Text Summarization Braches Out"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.667"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref26","article-title":"Visual genome: Connecting language and vision using crowdsourced dense image annotations","author":"krishna","year":"2016","journal-title":"arXiv 1602 07332"},{"key":"ref43","first-page":"65","article-title":"Meteor: An automatic metric for MT evaluation with improved correlation with human judgments","author":"banerjee","year":"2005","journal-title":"Proc ACL Workshop Intrinsic Extrinsic Eval Measures Mach Transl Summarization"},{"key":"ref25","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"Proc Adv Neural Inf Process Syst"}],"container-title":["IEEE Transactions on Image Processing"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/83\/9263394\/09329055.pdf?arnumber=9329055","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,2,2]],"date-time":"2026-02-02T20:47:05Z","timestamp":1770065225000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9329055\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":48,"URL":"https:\/\/doi.org\/10.1109\/tip.2021.3051476","relation":{},"ISSN":["1057-7149","1941-0042"],"issn-type":[{"value":"1057-7149","type":"print"},{"value":"1941-0042","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}