{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,22]],"date-time":"2026-01-22T08:15:17Z","timestamp":1769069717047,"version":"3.49.0"},"publisher-location":"Cham","reference-count":48,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030926588","type":"print"},{"value":"9783030926595","type":"electronic"}],"license":[{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2021,1,1]],"date-time":"2021-01-01T00:00:00Z","timestamp":1609459200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2021]]},"DOI":"10.1007\/978-3-030-92659-5_27","type":"book-chapter","created":{"date-parts":[[2022,1,13]],"date-time":"2022-01-13T07:09:18Z","timestamp":1642057758000},"page":"421-436","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":5,"title":["Diverse Image Captioning with\u00a0Grounded Style"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-6261-7941","authenticated-orcid":false,"given":"Franz","family":"Klein","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5413-9142","authenticated-orcid":false,"given":"Shweta","family":"Mahajan","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-9002-9832","authenticated-orcid":false,"given":"Stefan","family":"Roth","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2022,1,13]]},"reference":[{"key":"27_CR1","doi-asserted-by":"crossref","unstructured":"Anderson, P., Fernando, B., Johnson, M., Gould, S.: Guided open vocabulary image captioning with constrained beam search. In: EMNLP, pp. 936\u2013945 (2017)","DOI":"10.18653\/v1\/D17-1098"},{"key":"27_CR2","unstructured":"Anderson, P., Gould, S., Johnson, M.: Partially-supervised image captioning. In: NeurIPS, pp. 1875\u20131886 (2018)"},{"key":"27_CR3","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR, pp. 6077\u20136086 (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"27_CR4","doi-asserted-by":"crossref","unstructured":"Aneja, J., Agrawal, H., Batra, D., Schwing, A.: Sequential latent spaces for modeling the intention during diverse image captioning. In: ICCV, pp. 4261\u20134270 (2019)","DOI":"10.1109\/ICCV.2019.00436"},{"key":"27_CR5","doi-asserted-by":"crossref","unstructured":"Aneja, J., Deshpande, A., Schwing, A.G.: Convolutional image captioning. In: CVPR, pp. 5561\u20135570 (2018)","DOI":"10.1109\/CVPR.2018.00583"},{"key":"27_CR6","unstructured":"Baccianella, S., Esuli, A., Sebastiani, F.: SentiWordNet 3.0: an enhanced lexical resource for sentiment analysis and opinion mining. In: LREC, pp. 2200\u20132204 (2010)"},{"key":"27_CR7","unstructured":"Banerjee, S., Lavie, A.: METEOR: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and\/or Summarization, pp. 65\u201372 (2005)"},{"key":"27_CR8","unstructured":"Bird, S., Klein, E., Loper, E.: Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit. O\u2019Reilly Media, Inc. (2009)"},{"key":"27_CR9","doi-asserted-by":"crossref","unstructured":"Chen, C.K., Pan, Z., Liu, M.Y., Sun, M.: Unsupervised stylish image description generation via domain layer norm. In: AAAI, pp. 8151\u20138158 (2019)","DOI":"10.1609\/aaai.v33i01.33018151"},{"key":"27_CR10","unstructured":"Chen, X., et al.: Microsoft COCO captions: data collection and evaluation server. arXiv:1504.00325 (2015)"},{"key":"27_CR11","doi-asserted-by":"crossref","unstructured":"Chen, X., Zitnick, C.L.: Mind\u2019s eye: a recurrent visual representation for image caption generation. In: CVPR, pp. 2422\u20132431 (2015)","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"27_CR12","doi-asserted-by":"crossref","unstructured":"Cui, Y., Jia, M., Lin, T.Y., Song, Y., Belongie, S.: Class-balanced loss based on effective number of samples. In: CVPR, pp. 9268\u20139277 (2019)","DOI":"10.1109\/CVPR.2019.00949"},{"key":"27_CR13","doi-asserted-by":"crossref","unstructured":"Dai, B., Fidler, S., Urtasun, R., Lin, D.: Towards diverse and natural image descriptions via a conditional GAN. In: ICCV, pp. 2970\u20132979 (2017)","DOI":"10.1109\/ICCV.2017.323"},{"key":"27_CR14","doi-asserted-by":"crossref","unstructured":"Deshpande, A., Aneja, J., Wang, L., Schwing, A.G., Forsyth, D.: Fast, diverse and accurate image captioning guided by part-of-speech. In: CVPR, pp. 10695\u201310704 (2019)","DOI":"10.1109\/CVPR.2019.01095"},{"key":"27_CR15","unstructured":"Devlin, J., Gupta, S., Girshick, R., Mitchell, M., Zitnick, C.L.: Exploring nearest neighbor approaches for image captioning. arXiv:1505.04467 (2015)"},{"issue":"4","key":"27_CR16","doi-asserted-by":"publisher","first-page":"677","DOI":"10.1109\/TPAMI.2016.2599174","volume":"39","author":"J Donahue","year":"2017","unstructured":"Donahue, J., et al.: Long-term recurrent convolutional networks for visual recognition and description. TPAMI 39(4), 677\u2013691 (2017)","journal-title":"TPAMI"},{"key":"27_CR17","doi-asserted-by":"crossref","unstructured":"Gan, C., Gan, Z., He, X., Gao, J., Deng, L.: StyleNet: generating attractive visual captions with styles. In: CVPR, pp. 3137\u20133146 (2017)","DOI":"10.1109\/CVPR.2017.108"},{"key":"27_CR18","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: ICCV, pp. 1440\u20131448 (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"27_CR19","doi-asserted-by":"crossref","unstructured":"Guo, L., Liu, J., Yao, P., Li, J., Lu, H.: MSCap: multi-style image captioning with unpaired stylized text. In: CVPR, pp. 4204\u20134213 (2019)","DOI":"10.1109\/CVPR.2019.00433"},{"issue":"8","key":"27_CR20","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"27_CR21","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: DenseCap: fully convolutional localization networks for dense captioning. In: CVPR, pp. 4565\u20134574 (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"27_CR22","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"300","DOI":"10.1007\/978-3-030-30490-4_25","volume-title":"Artificial Neural Networks and Machine Learning \u2013 ICANN 2019: Text and Time Series","author":"T Karayil","year":"2019","unstructured":"Karayil, T., Irfan, A., Raue, F., Hees, J., Dengel, A.: Conditional GANs for image captioning with sentiments. In: Tetko, I.V., K\u016frkov\u00e1, V., Karpov, P., Theis, F. (eds.) ICANN 2019. LNCS, vol. 11730, pp. 300\u2013312. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-30490-4_25"},{"issue":"4","key":"27_CR23","doi-asserted-by":"publisher","first-page":"664","DOI":"10.1109\/TPAMI.2016.2598339","volume":"39","author":"A Karpathy","year":"2017","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. TPAMI 39(4), 664\u2013676 (2017)","journal-title":"TPAMI"},{"issue":"12","key":"27_CR24","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni, G., et al.: BabyTalk: understanding and generating simple image descriptions. TPAMI 35(12), 2891\u20132903 (2013)","journal-title":"TPAMI"},{"key":"27_CR25","unstructured":"Lin, C.Y.: ROUGE: a package for automatic evaluation of summaries. In: ACL Text Summarization Branches Out, pp. 74\u201381 (2004)"},{"key":"27_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"27_CR27","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., Socher, R.: Knowing when to look: Adaptive attention via a visual sentinel for image captioning. In: CVPR, pp. 3242\u20133250. IEEE Computer Society (2017)","DOI":"10.1109\/CVPR.2017.345"},{"key":"27_CR28","doi-asserted-by":"crossref","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Neural baby talk. In: CVPR, pp. 7219\u20137228 (2018)","DOI":"10.1109\/CVPR.2018.00754"},{"key":"27_CR29","doi-asserted-by":"crossref","unstructured":"Mahajan, S., Botschen, T., Gurevych, I., Roth, S.: Joint Wasserstein autoencoders for aligning multimodal embeddings. In: ICCVW, pp. 4561\u20134570 (2019)","DOI":"10.1109\/ICCVW.2019.00557"},{"key":"27_CR30","unstructured":"Mahajan, S., Gurevych, I., Roth, S.: Latent normalizing flows for many-to-many cross-domain mappings. In: ICLR (2020)"},{"key":"27_CR31","unstructured":"Mahajan, S., Roth, S.: Diverse image captioning with context-object split latent spaces. In: NeurIPS, pp. 3613\u20133624 (2020)"},{"key":"27_CR32","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Yuille, A.L.: Deep captioning with multimodal recurrent neural networks (m-RNN). In: ICLR (2015)"},{"key":"27_CR33","doi-asserted-by":"crossref","unstructured":"Mathews, A., Xie, L., He, X.: SentiCap: generating image descriptions with sentiments. In: AAAI, pp. 3574\u20133580 (2016)","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"27_CR34","doi-asserted-by":"crossref","unstructured":"Mathews, A.P., Xie, L., He, X.: SemStyle: learning to generate stylised image captions using unaligned text. In: CVPR, pp. 8591\u20138600 (2018)","DOI":"10.1109\/CVPR.2018.00896"},{"key":"27_CR35","unstructured":"Nezami, O.M., Dras, M., Wan, S., Paris, C.: Senti-attend: image captioning using sentiment and attention. arXiv:1811.09789 (2018)"},{"key":"27_CR36","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"270","DOI":"10.1007\/978-3-030-29908-8_22","volume-title":"PRICAI 2019: Trends in Artificial Intelligence","author":"O Mohamad Nezami","year":"2019","unstructured":"Mohamad Nezami, O., Dras, M., Wan, S., Paris, C., Hamey, L.: Towards generating stylized image captions via adversarial training. In: Nayak, A.C., Sharma, A. (eds.) PRICAI 2019. LNCS (LNAI), vol. 11670, pp. 270\u2013284. Springer, Cham (2019). https:\/\/doi.org\/10.1007\/978-3-030-29908-8_22"},{"key":"27_CR37","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"27_CR38","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"85","DOI":"10.1007\/978-3-319-46466-4_6","volume-title":"Computer Vision \u2013 ECCV 2016","author":"G Patterson","year":"2016","unstructured":"Patterson, G., Hays, J.: COCO attributes: attributes for people, animals, and objects. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9910, pp. 85\u2013100. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46466-4_6"},{"key":"27_CR39","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: CVPR, pp. 7008\u20137024 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"27_CR40","doi-asserted-by":"crossref","unstructured":"Shin, A., Ushiku, Y., Harada, T.: Image captioning with sentiment terms via weakly-supervised sentiment dataset. In: BMVC (2016)","DOI":"10.5244\/C.30.53"},{"key":"27_CR41","unstructured":"Vijayakumar, A.K., et al.: Diverse beam search: decoding diverse solutions from neural sequence models. arXiv:1610.02424 (2016)"},{"key":"27_CR42","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: CVPR, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"27_CR43","unstructured":"Wang, L., Schwing, A., Lazebnik, S.: Diverse and accurate image description using a variational auto-encoder with an additive Gaussian encoding space. In: NIPS, pp. 5756\u20135766 (2017)"},{"key":"27_CR44","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML, pp. 2048\u20132057 (2015)"},{"key":"27_CR45","doi-asserted-by":"crossref","unstructured":"Yao, T., Pan, Y., Li, Y., Mei, T.: Hierarchy parsing for image captioning. In: ICCV, pp. 2621\u20132629 (2019)","DOI":"10.1109\/ICCV.2019.00271"},{"key":"27_CR46","unstructured":"You, Q., Jin, H., Luo, J.: Image captioning at will: a versatile scheme for effectively injecting sentiments into image descriptions. arXiv:1801.10121 (2018)"},{"key":"27_CR47","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. TACL 2, 67\u201378 (2014)","journal-title":"TACL"},{"key":"27_CR48","doi-asserted-by":"crossref","unstructured":"Zhao, W., Wu, X., Zhang, X.: MemCap: memorizing style knowledge for image captioning. In: AAAI, pp. 12984\u201312992 (2020)","DOI":"10.1609\/aaai.v34i07.6998"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-92659-5_27","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,1,22]],"date-time":"2023-01-22T20:28:18Z","timestamp":1674419298000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-92659-5_27"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021]]},"ISBN":["9783030926588","9783030926595"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-92659-5_27","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2021]]},"assertion":[{"value":"13 January 2022","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"DAGM GCPR","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"DAGM German Conference on Pattern Recognition","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Bonn","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2021","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"28 September 2021","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"1 October 2021","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"43","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"dagm2021","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/www.dagm-gcpr.de\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"116","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"46","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"40% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"2.95","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"4.3","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"The conference was held virtually due to the COVID-19 pandemic.","order":10,"name":"additional_info_on_review_process","label":"Additional Info on Review Process","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}