{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,24]],"date-time":"2026-02-24T07:59:48Z","timestamp":1771919988048,"version":"3.50.1"},"publisher-location":"Cham","reference-count":50,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030585709","type":"print"},{"value":"9783030585716","type":"electronic"}],"license":[{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2020,1,1]],"date-time":"2020-01-01T00:00:00Z","timestamp":1577836800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2020]]},"DOI":"10.1007\/978-3-030-58571-6_37","type":"book-chapter","created":{"date-parts":[[2020,11,8]],"date-time":"2020-11-08T16:02:34Z","timestamp":1604851354000},"page":"629-644","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":21,"title":["Towards Unique and Informative Captioning of Images"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-7057-1613","authenticated-orcid":false,"given":"Zeyu","family":"Wang","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-1843-2165","authenticated-orcid":false,"given":"Berthy","family":"Feng","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9894-9983","authenticated-orcid":false,"given":"Karthik","family":"Narasimhan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-5272-3241","authenticated-orcid":false,"given":"Olga","family":"Russakovsky","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,11,9]]},"reference":[{"key":"37_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: SPICE: semantic propositional image caption evaluation. In: ECCV (2016)","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"37_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"37_CR3","unstructured":"Bahl, L., Brown, P., de Souza, P., Mercer, R.: Maximum mutual information estimation of hidden Markov model parameters for speech recognition. In: ICASSP (1986)"},{"key":"37_CR4","doi-asserted-by":"crossref","unstructured":"Cui, Y., Yang, G., Veit, A., Huang, X., Belongie, S.: Learning to evaluate image captioning. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00608"},{"key":"37_CR5","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1016\/j.eswa.2016.09.039","volume":"68","author":"D Datta","year":"2017","unstructured":"Datta, D., Varma, S., Chowdary, C.R., Singh, S.K.: Multimodal retrieval using mutual information based textual query reformulation. Expert Syst. Appl. 68, 81\u201392 (2017)","journal-title":"Expert Syst. Appl."},{"key":"37_CR6","doi-asserted-by":"crossref","unstructured":"Dognin, P., Melnyk, I., Mroueh, Y., Ross, J., Sercu, T.: Adversarial semantic alignment for improved image captions. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01071"},{"key":"37_CR7","doi-asserted-by":"crossref","unstructured":"Henning, C.A., Ewerth, R.: Estimating the information gap between textual and visual representations. In: ICMR (2017)","DOI":"10.1145\/3078971.3078991"},{"key":"37_CR8","doi-asserted-by":"crossref","unstructured":"Huang, L., Wang, W., Chen, J., Wei, X.Y.: Attention on attention for image captioning. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00473"},{"key":"37_CR9","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: DenseCap: fully convolutional localization networks for dense captioning. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"37_CR10","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Image retrieval using scene graphs. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"37_CR11","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"37_CR12","unstructured":"Kimura, R., Iida, S., Cui, H., Hung, P.H., Utsuro, T., Nagata, M.: Selecting informative context sentence by forced back-translation. In: MT Summit XVII (2019)"},{"key":"37_CR13","doi-asserted-by":"crossref","unstructured":"Krause, J., Johnson, J., Krishna, R., Fei-Fei, L.: A hierarchical approach for generating descriptive image paragraphs. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.356"},{"issue":"1","key":"37_CR14","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna, R., et al.: Visual genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vision 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int. J. Comput. Vision"},{"issue":"12","key":"37_CR15","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni, G., et al.: BabyTalk: understanding and generating simple image descriptions. IEEE Trans. Pattern Anal. Mach. Intell. 35(12), 2891\u20132903 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"37_CR16","doi-asserted-by":"crossref","unstructured":"Lavie, A., Agarwal, A.: Meteor: an automatic metric for MT evaluation with high levels of correlation with human judgments. In: StatMT (2007)","DOI":"10.3115\/1626355.1626389"},{"key":"37_CR17","doi-asserted-by":"crossref","unstructured":"Li, J., Galley, M., Brockett, C., Gao, J., Dolan, B.: A diversity-promoting objective function for neural conversation models. In: NAACL HLT (2016)","DOI":"10.18653\/v1\/N16-1014"},{"key":"37_CR18","unstructured":"Li, J., Jurafsky, D.: Mutual Information and Diverse Decoding Improve Neural Machine Translation. arXiv:1601.00372 [cs] (2016). arXiv: 1601.00372"},{"key":"37_CR19","doi-asserted-by":"crossref","unstructured":"Li, W., et al.: Object-driven text-to-image synthesis via adversarial training. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.01245"},{"key":"37_CR20","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., et al.: Microsoft COCO: common objects in context. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"37_CR21","doi-asserted-by":"crossref","unstructured":"Lindh, A., Ross, R.J., Mahalunkar, A., Salton, G., Kelleher, J.D.: Generating diverse and meaningful captions. In: ICANN (2018)","DOI":"10.1007\/978-3-030-01418-6_18"},{"key":"37_CR22","doi-asserted-by":"crossref","unstructured":"Liu, L., Tang, J., Wan, X., Guo, Z.: Generating diverse and descriptive image captions using visual paraphrases. In: ICCV (2019)","DOI":"10.1109\/ICCV.2019.00434"},{"key":"37_CR23","doi-asserted-by":"crossref","unstructured":"Liu, S., Zhu, Z., Ye, N., Guadarrama, S., Murphy, K.: Improved image captioning via policy gradient optimization of SPIDEr. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.100"},{"key":"37_CR24","doi-asserted-by":"crossref","unstructured":"Liu, X., Li, H., Shao, J., Chen, D., Wang, X.: Show, tell and discriminate: image captioning by self-retrieval with partially labeled data. In: ECCV (2018)","DOI":"10.1007\/978-3-030-01267-0_21"},{"key":"37_CR25","doi-asserted-by":"crossref","unstructured":"Lu, D., Whitehead, S., Huang, L., Ji, H., Chang, S.F.: Entity-aware image caption generation. In: EMNLP (2018)","DOI":"10.18653\/v1\/D18-1435"},{"key":"37_CR26","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., Socher, R.: knowing when to look: adaptive attention via a visual sentinel for image captioning. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.345"},{"key":"37_CR27","doi-asserted-by":"crossref","unstructured":"Luo, R., Shakhnarovich, G., Cohen, S., Price, B.: Discriminability objective for training descriptive captions. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00728"},{"key":"37_CR28","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"37_CR29","doi-asserted-by":"crossref","unstructured":"Melas-Kyriazi, L., Rush, A., Han, G.: Training for diversity in image paragraph captioning. In: EMNLP (2018)","DOI":"10.18653\/v1\/D18-1084"},{"key":"37_CR30","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: ACL (2001)","DOI":"10.3115\/1073083.1073135"},{"key":"37_CR31","doi-asserted-by":"crossref","unstructured":"Povey, D., Woodland, P.: Minimum phone error and I-smoothing for improved discriminative training. In: ICASSP (2002)","DOI":"10.1109\/ICASSP.2002.1005687"},{"key":"37_CR32","doi-asserted-by":"crossref","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. (2017)","DOI":"10.1109\/TPAMI.2016.2577031"},{"key":"37_CR33","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Hendricks, L.A., Burns, K., Darrell, T., Saenko, K.: Object hallucination in image captioning. In: EMNLP (2018)","DOI":"10.18653\/v1\/D18-1437"},{"key":"37_CR34","doi-asserted-by":"crossref","unstructured":"Shetty, R., Rohrbach, M., Hendricks, L.A., Fritz, M., Schiele, B.: speaking the same language: matching machine to human captions by adversarial training. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.445"},{"key":"37_CR35","unstructured":"Jones, K.S.: A statistical interpretation of term specificity and its application in retrieval. J. Doc. (1972)"},{"key":"37_CR36","doi-asserted-by":"crossref","unstructured":"Tu, Z., Liu, Y., Shang, L., Liu, X., Li, H.: Neural machine translation with reconstruction. In: AAAI (2017)","DOI":"10.1609\/aaai.v31i1.10950"},{"key":"37_CR37","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Bengio, S., Murphy, K., Parikh, D., Chechik, G.: Context-aware captions from context-agnostic supervision. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.120"},{"key":"37_CR38","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: CIDEr: consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"37_CR39","doi-asserted-by":"crossref","unstructured":"Vijayakumar, A.K., et al.: Diverse beam search for improved description of complex scenes. In: AAAI (2018)","DOI":"10.1609\/aaai.v32i1.12340"},{"key":"37_CR40","unstructured":"Vijayakumar, A.K., et al.: Diverse Beam Search: Decoding Diverse Solutions from Neural Sequence Models. arXiv:1610.02424 [cs] (2018). arXiv: 1610.02424"},{"key":"37_CR41","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"37_CR42","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and Tell: Lessons Learned from the 2015 MSCOCO Image Captioning Challenge (2017)","DOI":"10.1109\/TPAMI.2016.2587640"},{"key":"37_CR43","doi-asserted-by":"crossref","unstructured":"Wang, Q., Chan, A.B.: Describing like humans: on diversity in image captioning. In: CVPR (2019)","DOI":"10.1109\/CVPR.2019.00432"},{"key":"37_CR44","doi-asserted-by":"crossref","unstructured":"Wu, B., Jia, F., Liu, W., Ghanem, B.: Diverse image annotation. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.656"},{"key":"37_CR45","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"},{"key":"37_CR46","doi-asserted-by":"crossref","unstructured":"Xu, T., et al.: AttnGAN: fine-grained text to image generation with attentional generative adversarial networks. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00143"},{"key":"37_CR47","doi-asserted-by":"crossref","unstructured":"Yao, T., Mei, T., Ngo, C.W.: Co-reranking by mutual reinforcement for image search. In: CVPR (2010)","DOI":"10.1145\/1816041.1816048"},{"key":"37_CR48","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"37_CR49","doi-asserted-by":"crossref","unstructured":"Zhang, H., et al.: StackGAN: text to photo-realistic image synthesis with stacked generative adversarial networks. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.629"},{"key":"37_CR50","unstructured":"Zhang, Y., et al.: Generating informative and diverse conversational responses via adversarial information maximization. In: NeurIPS (2018)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2020"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-58571-6_37","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,11,8]],"date-time":"2024-11-08T00:12:06Z","timestamp":1731024726000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-58571-6_37"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020]]},"ISBN":["9783030585709","9783030585716"],"references-count":50,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-58571-6_37","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2020]]},"assertion":[{"value":"9 November 2020","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Glasgow","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"United Kingdom","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2020","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"23 August 2020","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 August 2020","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2020","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2020.eu\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"OpenReview","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"5025","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"1360","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"27% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}