{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,7,11]],"date-time":"2026-07-11T17:34:08Z","timestamp":1783791248885,"version":"3.55.0"},"reference-count":255,"publisher":"Institute of Electrical and Electronics Engineers (IEEE)","issue":"1","license":[{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/ieeexplore.ieee.org\/Xplorehelp\/downloads\/license-information\/IEEE.html"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2023,1,1]],"date-time":"2023-01-01T00:00:00Z","timestamp":1672531200000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"funder":[{"DOI":"10.13039\/501100000780","name":"European Commission","doi-asserted-by":"publisher","award":["951847"],"award-info":[{"award-number":["951847"]}],"id":[{"id":"10.13039\/501100000780","id-type":"DOI","asserted-by":"publisher"}]},{"DOI":"10.13039\/501100000780","name":"European Commission","doi-asserted-by":"publisher","award":["952026"],"award-info":[{"award-number":["952026"]}],"id":[{"id":"10.13039\/501100000780","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Fondazione di Modena"},{"name":"Italian Ministry of Foreign Affairs and International Cooperation"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEEE Trans. Pattern Anal. Mach. Intell."],"published-print":{"date-parts":[[2023,1,1]]},"DOI":"10.1109\/tpami.2022.3148210","type":"journal-article","created":{"date-parts":[[2022,2,7]],"date-time":"2022-02-07T21:04:55Z","timestamp":1644267895000},"page":"539-559","source":"Crossref","is-referenced-by-count":297,"title":["From Show to Tell: A Survey on Deep Learning-Based Image Captioning"],"prefix":"10.1109","volume":"45","author":[{"given":"Matteo","family":"Stefanini","sequence":"first","affiliation":[{"name":"Department of Engineering &#x201C;Enzo Ferrari&#x201D;, University of Modena and Reggio Emilia, Modena, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9640-9385","authenticated-orcid":false,"given":"Marcella","family":"Cornia","sequence":"additional","affiliation":[{"name":"Department of Engineering &#x201C;Enzo Ferrari&#x201D;, University of Modena and Reggio Emilia, Modena, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5125-4957","authenticated-orcid":false,"given":"Lorenzo","family":"Baraldi","sequence":"additional","affiliation":[{"name":"Department of Engineering &#x201C;Enzo Ferrari&#x201D;, University of Modena and Reggio Emilia, Modena, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-7885-6050","authenticated-orcid":false,"given":"Silvia","family":"Cascianelli","sequence":"additional","affiliation":[{"name":"Department of Engineering &#x201C;Enzo Ferrari&#x201D;, University of Modena and Reggio Emilia, Modena, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8687-6609","authenticated-orcid":false,"given":"Giuseppe","family":"Fiameni","sequence":"additional","affiliation":[{"name":"NVIDIA AI Technology Centre, Milan, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Rita","family":"Cucchiara","sequence":"additional","affiliation":[{"name":"Department of Engineering &#x201C;Enzo Ferrari&#x201D;, University of Modena and Reggio Emilia, Modena, Italy"}],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"263","reference":[{"key":"ref170","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.323"},{"key":"ref172","first-page":"915","article-title":"BERTTune: Fine-tuning neural machine translation with BERTScore","author":"unanue","year":"2021","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref171","article-title":"BERTScore: Evaluating text generation with BERT","author":"zhang","year":"2020","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref174","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_13"},{"key":"ref173","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1220"},{"key":"ref176","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.8"},{"key":"ref175","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.595"},{"key":"ref178","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00904"},{"key":"ref177","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.130"},{"key":"ref168","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-short.29"},{"key":"ref169","first-page":"656","article-title":"A neural compositional paradigm for image captioning","author":"dai","year":"2018","journal-title":"Proc 32nd Int Conf Neural Inf Process Syst"},{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.90"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.131"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.503"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.277"},{"key":"ref31","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"ref30","first-page":"2422","article-title":"Mind's eye: A recurrent visual representation for image caption generation","author":"chen","year":"2015","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00146"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123275"},{"key":"ref35","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.138"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.29"},{"key":"ref181","doi-asserted-by":"publisher","DOI":"10.1145\/3240508.3240640"},{"key":"ref180","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01278"},{"key":"ref185","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00425"},{"key":"ref184","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01246-5_31"},{"key":"ref183","first-page":"1575","article-title":"VIVO: Visual vocabulary pre-training for novel object captioning","author":"hu","year":"2020","journal-title":"Proc 35th AAAI Conf Artif Intell"},{"key":"ref182","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D17-1098"},{"key":"ref189","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1208"},{"key":"ref188","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/128"},{"key":"ref187","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.01042"},{"key":"ref186","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00751"},{"key":"ref28","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref27","article-title":"Deep captioning with multimodal recurrent neural networks (m-RNN)","author":"mao","year":"2015","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref179","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.559"},{"key":"ref29","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-49724-8_2"},{"key":"ref22","doi-asserted-by":"publisher","DOI":"10.1109\/PARC49193.2020.236619"},{"key":"ref21","doi-asserted-by":"publisher","DOI":"10.1007\/s00371-018-1566-y"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref26","first-page":"1097","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"Proc 25th Int Conf Neural Inf Process Syst"},{"key":"ref25","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref50","first-page":"2369","article-title":"Review networks for caption generation","author":"yang","year":"2016","journal-title":"Proc 30th Int Conf Neural Inf Process Syst"},{"key":"ref51","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.667"},{"key":"ref154","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.445"},{"key":"ref153","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.coling-main.210"},{"key":"ref156","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00432"},{"key":"ref155","first-page":"1730","article-title":"Measuring the diversity of automatic image descriptions","author":"van miltenburg","year":"2018","journal-title":"Proc 27th Int Conf Comput Linguistics"},{"key":"ref150","first-page":"65","article-title":"METEOR: An automatic metric for MT evaluation with improved correlation with human judgments","author":"banerjee","year":"2005","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref152","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00608"},{"key":"ref151","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01237-3_3"},{"key":"ref146","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46493-0_1"},{"key":"ref147","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_17"},{"key":"ref148","first-page":"1060","article-title":"Generative adversarial text to image synthesis","author":"reed","year":"2016","journal-title":"Proc 33rd Int Conf Mach Learn"},{"key":"ref149","article-title":"Transparent human evaluation for image captioning","author":"kasai","year":"2021"},{"key":"ref59","first-page":"91","article-title":"Faster R-CNN: Towards real-time object detection with region proposal networks","author":"ren","year":"2015","journal-title":"Proc 28th Int Conf Neural Inf Process Syst"},{"key":"ref58","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00636"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01252-6_5"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1145\/3177745"},{"key":"ref55","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.334"},{"key":"ref54","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.272"},{"key":"ref53","article-title":"Seeing with humans: Gaze-assisted neural image captioning","author":"sugano","year":"2016"},{"key":"ref52","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_31"},{"key":"ref40","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.524"},{"key":"ref167","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01383"},{"key":"ref166","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.93"},{"key":"ref165","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.eval4nlp-1.4"},{"key":"ref164","first-page":"1152","article-title":"Explore and Explain: Self-supervised navigation and recounting","author":"bigazzi","year":"2020","journal-title":"Proc 25th Int Conf Pattern Recognit"},{"key":"ref163","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00850"},{"key":"ref162","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/E17-1019"},{"key":"ref161","first-page":"957","article-title":"From word embeddings to document distances","author":"kusner","year":"2015","journal-title":"Proc 32nd Int Conf Mach Learn"},{"key":"ref160","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58571-6_37"},{"key":"ref4","first-page":"1143","article-title":"Im2Text: Describing images using 1 million captioned photographs","author":"ordonez","year":"2011","journal-title":"Proc 24th Int Conf Neural Inf Process Syst"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"ref6","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2014","journal-title":"Proc Int Conf Neural Inf Process Syst Workshops"},{"key":"ref5","first-page":"2121","article-title":"DeViSE: A deep visual-semantic embedding model","author":"frome","year":"2013","journal-title":"Proc 26th Int Conf Neural Inf Process Syst"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2010.2050411"},{"key":"ref159","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1156"},{"key":"ref7","first-page":"1889","article-title":"Deep fragment embeddings for bidirectional image sentence mapping","author":"karpathy","year":"2014","journal-title":"Proc 27th Int Conf Neural Inf Process Syst"},{"key":"ref49","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12266"},{"key":"ref157","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2020.3013834"},{"key":"ref9","first-page":"1250","article-title":"Generating image descriptions using dependency relational patterns","author":"aker","year":"2010","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref158","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1437"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00834"},{"key":"ref45","article-title":"Neural machine translation by jointly learning to align and translate","author":"bahdanau","year":"2014","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00184"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.780"},{"key":"ref42","first-page":"2048","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"Proc 32nd Int Conf Mach Learn"},{"key":"ref41","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.127"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01228-1_18"},{"key":"ref43","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.345"},{"key":"ref73","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00271"},{"key":"ref72","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.664"},{"key":"ref71","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01094"},{"key":"ref70","article-title":"Semi-supervised classification with graph convolutional networks","author":"kipf","year":"2017","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref76","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00902"},{"key":"ref77","article-title":"Image captioning: Transforming objects into words","author":"herdade","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"key":"ref74","first-page":"6000","article-title":"Attention is all you need","author":"vaswani","year":"2017","journal-title":"Proc 31st Int Conf Neural Inf Process Syst"},{"key":"ref75","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00435"},{"key":"ref78","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01034"},{"key":"ref79","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00473"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"ref62","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00898"},{"key":"ref61","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"ref63","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00856"},{"key":"ref64","article-title":"Adaptively aligned image captioning via adaptive attention time","author":"huang","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"key":"ref65","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6898"},{"key":"ref66","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2019.2909864"},{"key":"ref67","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.140"},{"key":"ref68","first-page":"711","article-title":"Exploring visual relationship for image captioning","author":"yao","year":"2018","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref69","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350943"},{"key":"ref197","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.356"},{"key":"ref198","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.364"},{"key":"ref199","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2018\/592"},{"key":"ref193","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.214"},{"key":"ref194","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v33i01.33018650"},{"key":"ref195","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00640"},{"key":"ref196","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00643"},{"key":"ref95","article-title":"How much can CLIP benefit vision-and-language tasks?","author":"shen","year":"2021"},{"key":"ref94","article-title":"SimVLM: Simple visual language model pretraining with weak supervision","author":"wang","year":"2021"},{"key":"ref190","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3060948"},{"key":"ref93","article-title":"Learning transferable visual models from natural language supervision","author":"radford","year":"2021"},{"key":"ref191","article-title":"RATT: Recurrent attention to transient tasks for continual image captioning","author":"del chiaro","year":"2020","journal-title":"Proc 34th Int Conf Neural Inf Process Syst"},{"key":"ref92","article-title":"CPTR: Full transformer network for image captioning","author":"liu","year":"2021"},{"key":"ref192","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.494"},{"key":"ref91","first-page":"10347","article-title":"Training data-efficient image transformers & distillation through attention","author":"touvron","year":"2021","journal-title":"Proc 38th Int Conf Mach Learn"},{"key":"ref90","article-title":"An image is worth 16x16 words: Transformers for image recognition at scale","author":"dosovitskiy","year":"2021","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref98","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D19-1514"},{"key":"ref99","article-title":"ViLBERT: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks","author":"lu","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"key":"ref96","article-title":"ClipCap: CLIP prefix for image captioning","author":"mokady","year":"2021"},{"key":"ref97","article-title":"Universal captioner: Long-tail vision-and-language model training through content-style separation","author":"cornia","year":"2021"},{"key":"ref82","first-page":"153","article-title":"Image captioning through image transformer","author":"he","year":"2020","journal-title":"Proc Asian Conf Comput Vis"},{"key":"ref81","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01059"},{"key":"ref84","doi-asserted-by":"publisher","DOI":"10.1109\/ICRA40945.2020.9196653"},{"key":"ref83","article-title":"Prophet attention: Predicting attention with future attention","author":"liu","year":"2020","journal-title":"Proc 34th Int Conf Neural Inf Process Syst"},{"key":"ref80","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01098"},{"key":"ref89","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01521"},{"key":"ref85","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i2.16258"},{"key":"ref86","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i3.16328"},{"key":"ref87","first-page":"6847","article-title":"Aligning visual regions and textual concepts for semantic-grounded image representations","author":"liu","year":"2019","journal-title":"Proc Int Conf Neural Inf Process"},{"key":"ref88","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01028"},{"key":"ref200","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01216-8_45"},{"key":"ref101","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.7005"},{"key":"ref100","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"ref209","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58568-6_34"},{"key":"ref203","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413753"},{"key":"ref204","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00136"},{"key":"ref201","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350961"},{"key":"ref202","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00864"},{"key":"ref207","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/D18-1436"},{"key":"ref208","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00472"},{"key":"ref205","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v35i4.16476"},{"key":"ref206","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01245"},{"key":"ref211","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00210"},{"key":"ref210","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2021.3074803"},{"key":"ref212","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00275"},{"key":"ref213","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v32i1.12340"},{"key":"ref214","first-page":"898","article-title":"Contrastive learning for image captioning","author":"dai","year":"2017","journal-title":"Proc 31st Int Conf Neural Inf Process Syst"},{"key":"ref215","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00434"},{"key":"ref216","first-page":"5758","article-title":"Diverse and accurate image description using a variational auto-encoder with an additive gaussian encoding space","author":"wang","year":"2017","journal-title":"Proc 31th Int Conf Neural Inf Process Syst"},{"key":"ref217","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2019.00436"},{"key":"ref218","first-page":"1929","article-title":"Variational structured semantic inference for diverse image captioning","author":"chen","year":"2019","journal-title":"Proc 33rd Int Conf Neural Inf Process Syst"},{"key":"ref219","article-title":"Diverse image captioning with context-object split latent spaces","author":"mahajan","year":"2020","journal-title":"Proc 34th Int Conf Neural Inf Process Syst"},{"key":"ref220","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01095"},{"key":"ref222","doi-asserted-by":"publisher","DOI":"10.1109\/TMM.2019.2896494"},{"key":"ref221","article-title":"Multilingual image description with neural sequence models","author":"elliott","year":"2015","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref229","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.acl-long.387"},{"key":"ref228","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01354"},{"key":"ref227","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1240"},{"key":"ref226","doi-asserted-by":"publisher","DOI":"10.1145\/3343031.3350996"},{"key":"ref225","doi-asserted-by":"publisher","DOI":"10.1145\/3123266.3123366"},{"key":"ref224","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-3210"},{"key":"ref223","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1168"},{"key":"ref127","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P16-1162"},{"key":"ref126","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01101"},{"key":"ref125","article-title":"XGPT: Cross-modal generative pre-training for image captioning","author":"xia","year":"2020"},{"key":"ref124","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00646"},{"key":"ref129","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00166"},{"key":"ref128","first-page":"740","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref130","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/P18-1238"},{"key":"ref133","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2016.13"},{"key":"ref134","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58601-0_1"},{"key":"ref131","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00356"},{"key":"ref132","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58520-4_25"},{"key":"ref232","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.01305"},{"key":"ref233","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.542"},{"key":"ref230","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV48922.2021.00537"},{"key":"ref231","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.118"},{"key":"ref239","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.108"},{"key":"ref238","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i05.6503"},{"key":"ref235","doi-asserted-by":"publisher","DOI":"10.1145\/2998181.2998364"},{"key":"ref234","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2021.emnlp-main.419"},{"key":"ref237","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2018.2824816"},{"key":"ref236","first-page":"6432","article-title":"Attend to you: Personalized image captioning with context sequence memory networks","author":"chunseong park","year":"2017","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"ref136","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01275"},{"key":"ref135","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2017.2721945"},{"key":"ref138","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58558-7_38"},{"key":"ref137","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58536-5_44"},{"key":"ref139","article-title":"Zero-shot text-to-image generation","author":"ramesh","year":"2021"},{"key":"ref140","doi-asserted-by":"publisher","DOI":"10.1145\/2812802"},{"key":"ref141","doi-asserted-by":"publisher","DOI":"10.1145\/3404835.3463257"},{"key":"ref142","first-page":"4904","article-title":"Scaling up visual and vision-language representation learning with noisy text supervision","author":"jia","year":"2021","journal-title":"Proc 38th Int Conf Mach Learn"},{"key":"ref143","first-page":"2010","article-title":"Caltech-UCSD birds 200","author":"welinder","year":"2010"},{"key":"ref2","first-page":"1987","article-title":"Automatic image captioning","author":"pan","year":"2004","journal-title":"Proc IEEE Int Conf Multimedia Expo"},{"key":"ref144","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"ref1","doi-asserted-by":"publisher","DOI":"10.1155\/2015\/565871"},{"key":"ref145","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.64"},{"key":"ref241","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00433"},{"key":"ref242","doi-asserted-by":"publisher","DOI":"10.1609\/aaai.v34i07.6998"},{"key":"ref243","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.01280"},{"key":"ref244","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2019.00859"},{"key":"ref240","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00896"},{"key":"ref248","first-page":"211","article-title":"Comprehensive image captioning via scene graph decomposition","author":"zhong","year":"2020","journal-title":"Proc Eur Conf Comput Vis"},{"key":"ref247","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00998"},{"key":"ref246","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01657"},{"key":"ref245","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.01249"},{"key":"ref249","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58601-0_42"},{"key":"ref109","article-title":"Improving language understanding by generative pre-training","author":"radford","year":"2018"},{"key":"ref108","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00583"},{"key":"ref107","article-title":"AutoCaption: Image captioning with neural architecture search","author":"zhu","year":"2020"},{"key":"ref106","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2018.00754"},{"key":"ref105","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref104","article-title":"Scaling up vision-language pre-training for image captioning","author":"hu","year":"2021"},{"key":"ref103","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"ref102","first-page":"4171","article-title":"BERT: Pre-training of deep bidirectional transformers for language understanding","author":"devlin","year":"2018","journal-title":"Proc Conf North Amer Chapter Assoc Comput Linguistics"},{"key":"ref111","doi-asserted-by":"publisher","DOI":"10.24963\/ijcai.2020\/107"},{"key":"ref112","doi-asserted-by":"publisher","DOI":"10.1145\/3394171.3413901"},{"key":"ref110","article-title":"Fast image caption generation with position alignment","author":"fei","year":"2019","journal-title":"Proc AAAI Conf Artif Intell Workshops"},{"key":"ref250","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR42600.2020.00486"},{"key":"ref251","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/W16-3203"},{"key":"ref254","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58577-8_7"},{"key":"ref255","first-page":"3319","article-title":"Axiomatic attribution for deep networks","author":"sundararajan","year":"2017","journal-title":"Proc 34th Int Conf Mach Learn"},{"key":"ref252","article-title":"Going beneath the surface: Evaluating image captioning for grammaticality, truthfulness and diversity","author":"xie","year":"2019"},{"key":"ref253","doi-asserted-by":"publisher","DOI":"10.18653\/v1\/2020.acl-main.583"},{"key":"ref10","first-page":"444","article-title":"Corpus-guided sentence generation of natural images","author":"yang","year":"2011","journal-title":"Proc Conf Empir Methods Natural Lang Process"},{"key":"ref11","first-page":"220","article-title":"Composing simple image descriptions using web-scale n-grams","author":"li","year":"2011","journal-title":"Proc 15th Conf Comput Natural Lang Learn"},{"key":"ref12","first-page":"606","article-title":"Choosing linguistics over vision to describe images","author":"gupta","year":"2012","journal-title":"Proc 26th AAAI Conf Artif Intell"},{"key":"ref13","first-page":"747","article-title":"Midge: Generating image descriptions from computer vision detections","author":"mitchell","year":"2012","journal-title":"Proc 13th Conf Eur Chapter Assoc Comput Linguistics"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2012.162"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1162\/tacl_a_00188"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1613\/jair.4900"},{"key":"ref118","first-page":"74","article-title":"ROUGE: A package for automatic evaluation of summaries","author":"lin","year":"2004","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1016\/j.neucom.2018.05.080"},{"key":"ref117","first-page":"311","article-title":"BLEU: A method for automatic evaluation of machine translation","author":"papineni","year":"2002","journal-title":"Proc Annual Meeting of the Assoc Computational Linguistics"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1145\/3295748"},{"key":"ref19","doi-asserted-by":"publisher","DOI":"10.1613\/jair.3994"},{"key":"ref119","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2017.128"},{"key":"ref114","doi-asserted-by":"publisher","DOI":"10.1017\/CBO9780511815829"},{"key":"ref113","article-title":"Fast sequence generation with multi-agent reinforcement learning","author":"guo","year":"2021"},{"key":"ref116","doi-asserted-by":"publisher","DOI":"10.1007\/BF00992696"},{"key":"ref115","article-title":"Sequence level training with recurrent neural networks","author":"ranzato","year":"2016","journal-title":"Proc Int Conf Learn Representations"},{"key":"ref120","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2017.100"},{"key":"ref121","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"ref122","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"ref123","article-title":"Actor-critic sequence training for image captioning","author":"zhang","year":"2017","journal-title":"Proc Int Conf Neural Inf Process"}],"container-title":["IEEE Transactions on Pattern Analysis and Machine Intelligence"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/34\/9970415\/09706348.pdf?arnumber=9706348","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,12,26]],"date-time":"2022-12-26T19:13:58Z","timestamp":1672082038000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/9706348\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,1,1]]},"references-count":255,"journal-issue":{"issue":"1"},"URL":"https:\/\/doi.org\/10.1109\/tpami.2022.3148210","relation":{},"ISSN":["0162-8828","2160-9292","1939-3539"],"issn-type":[{"value":"0162-8828","type":"print"},{"value":"2160-9292","type":"electronic"},{"value":"1939-3539","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,1,1]]}}}