{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,29]],"date-time":"2026-05-29T11:23:57Z","timestamp":1780053837275,"version":"3.54.0"},"reference-count":74,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"name":"National Key Research and Development Program of China","award":["2018YFC0830103"],"award-info":[{"award-number":["2018YFC0830103"]}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["61876045"],"award-info":[{"award-number":["61876045"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["U1811463"],"award-info":[{"award-number":["U1811463"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"National High Level Talents Special Support Plan"},{"DOI":"10.13039\/501100003453","name":"Natural Science Foundation of Guangdong Province","doi-asserted-by":"publisher","award":["2017A030312006"],"award-info":[{"award-number":["2017A030312006"]}],"id":[{"id":"10.13039\/501100003453","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Zhujiang Science and Technology New Star Project of Guangzhou","award":["201906010057"],"award-info":[{"award-number":["201906010057"]}]}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Multimedia"],"published-print":{"date-parts":[[2021]]},"DOI":"10.1109\/tmm.2020.3011317","type":"journal-article","created":{"date-parts":[[2020,7,23]],"date-time":"2020-07-23T20:37:22Z","timestamp":1595536642000},"page":"2413-2427","source":"Crossref","is-referenced-by-count":75,"title":["Fine-Grained Image Captioning With Global-Local Discriminative Objective"],"prefix":"10.1109","volume":"23","author":[{"given":"Jie","family":"Wu","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5848-5624","authenticated-orcid":false,"given":"Tianshui","family":"Chen","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-2132-6515","authenticated-orcid":false,"given":"Hefeng","family":"Wu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Zhi","family":"Yang","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Guangchun","family":"Luo","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0003-2248-3755","authenticated-orcid":false,"given":"Liang","family":"Lin","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1179"},{"key":"ref72","article-title":"Adam: A method for stochastic optimization","author":"kingma","year":"2014"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref70","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref39","first-page":"74","article-title":"Rouge: A package for automatic evaluation of summaries","volume":"8","author":"lin","year":"0","journal-title":"Proc ACL-04 Workshop Text Summarization Branches Out"},{"key":"ref74","first-page":"2579","article-title":"Visualizing data using T-SNE","volume":"9","author":"maaten","year":"2008","journal-title":"J Mach Learn Res"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.3115\/1073083.1073135"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01264-9_42"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2924576"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2832602"},{"key":"ref30","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2904878"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00646"},{"key":"ref36","article-title":"Sequence level training with recurrent neural networks","author":"ranzato","year":"2015"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00632"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01094"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.524"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.667"},{"key":"ref28","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2855406"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240523"},{"key":"ref64","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2855415"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2896516"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"key":"ref66","article-title":"Microsoft coco captions: Data collection and evaluation server","author":"chen","year":"2015"},{"key":"ref67","first-page":"139","article-title":"Collecting image annotations using amazon's mechanical turk","author":"rashtchian","year":"0","journal-title":"Proc NAACL HLT 2010 Workshop Creating Speech Lang Data Amazon's Mech Turk"},{"key":"ref68","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/TNNLS.2015.2506664"},{"key":"ref69","first-page":"65","article-title":"Meteor: An automatic metric for mt evaluation with improved correlation with human judgments","volume":"29","author":"banerjee","year":"0","journal-title":"Proc ACL Workshop Intrinsic Extrinsic Eval Measures Mach Transl Summarization"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2865280"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref22","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2014"},{"key":"ref21","first-page":"448","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"0","journal-title":"Proc Int Conf Mach Learn"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.58"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2778563"},{"key":"ref26","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00061"},{"key":"ref25","first-page":"6730","article-title":"Recurrent attentional reinforcement learning for multi-label image recognition","author":"chen","year":"0","journal-title":"Proc 32nd AAAI Conf Artif Intell"},{"key":"ref50","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00251"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2807588"},{"key":"ref59","article-title":"Deep captioning with multimodal recurrent neural networks (M-RNN)","author":"mao","year":"0"},{"key":"ref58","first-page":"2204","article-title":"Recurrent models of visual attention","author":"mnih","year":"0","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1108\/00220410410560582"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01267-0_21"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.9"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.120"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298682"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.445"},{"key":"ref40","article-title":"Actor-critic sequence training for image captioning","author":"zhang","year":"2017"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.323"},{"key":"ref12","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00728"},{"key":"ref13","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref14","first-page":"898","article-title":"Contrastive learning for image captioning","author":"dai","year":"0","journal-title":"Proc Advances Neural Inf Process Syst"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2015.2477044"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2855422"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2018.2859820"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2746267"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref4","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/87"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2017.2774041"},{"key":"ref6","article-title":"Stack-captioning: Coarse-to-fine learning for image captioning","author":"gu","year":"0","journal-title":"Proc Assoc Advancement Artif Intell"},{"key":"ref5","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1007\/978-1-4615-3618-5_2"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref49","article-title":"Object discovery by generative adversarial & ranking networks","author":"diba","year":"2017"},{"key":"ref9","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/TIP.2016.2545929"},{"key":"ref45","article-title":"Vse++: improved visual-semantic embeddings","author":"faghri","year":"2017"},{"key":"ref48","article-title":"Ranking CGANs: Subjective control over semantic image attributes","author":"saquil","year":"0","journal-title":"Proc Brit Mach Vis Conf"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.670"},{"key":"ref42","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2018.2888822"},{"key":"ref41","first-page":"6706","article-title":"Temporal-difference learning with sampling baseline for image captioning","author":"chen","year":"0","journal-title":"Proc 32nd Conf Artif Intell"},{"key":"ref44","article-title":"Conditional generative adversarial nets","author":"mirza","year":"2014"},{"key":"ref43","first-page":"2672","article-title":"Generative adversarial nets","author":"goodfellow","year":"0","journal-title":"Proc Adv Neural Inf Process Syst"}],"container-title":["IEEE Transactions on Multimedia"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/6046\/9296985\/09146724.pdf?arnumber=9146724","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,5,10]],"date-time":"2022-05-10T14:51:34Z","timestamp":1652194294000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9146724\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"references-count":74,"URL":"https:\/\/doi.org\/10.1109\/tmm.2020.3011317","relation":{},"ISSN":["1520-9210","1941-0077"],"issn-type":[{"value":"1520-9210","type":"print"},{"value":"1941-0077","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]}}}