{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,2,21]],"date-time":"2025-02-21T23:46:01Z","timestamp":1740181561723,"version":"3.37.3"},"reference-count":98,"publisher":"Springer Science and Business Media LLC","issue":"4","license":[{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2020,7,1]],"date-time":"2020-07-01T00:00:00Z","timestamp":1593561600000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["SN COMPUT. SCI."],"published-print":{"date-parts":[[2020,7]]},"DOI":"10.1007\/s42979-020-00238-4","type":"journal-article","created":{"date-parts":[[2020,7,8]],"date-time":"2020-07-08T14:02:31Z","timestamp":1594216951000},"update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":6,"title":["AACR: Feature Fusion Effects of Algebraic Amalgamation Composed Representation on (De)Compositional Network for Caption Generation for Images"],"prefix":"10.1007","volume":"1","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1563-9304","authenticated-orcid":false,"given":"Chiranjib","family":"Sur","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2020,7,8]]},"reference":[{"issue":"5","key":"238_CR1","first-page":"6","volume":"3","author":"P Anderson","year":"2018","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L. Bottom-up and top-down attention for image captioning and visual question answering. CVPR. 2018;3(5):6.","journal-title":"CVPR"},{"key":"238_CR2","unstructured":"Anne HL, et al. Deep compositional captioning: describing novel object categories without paired training data. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2016, pp. 1\u201310."},{"key":"238_CR3","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Lin Z, Zhao S, Ha J. Show, observe and tell: attribute-driven attention model for image captioning. In: IJCAI, 2018, pp. 606\u201312.","DOI":"10.24963\/ijcai.2018\/84"},{"key":"238_CR4","doi-asserted-by":"crossref","unstructured":"Chen M, Ding G, Zhao S, Chen H, Liu Q, Han J. Reference based LSTM for image captioning. In: AAAI, 2017, pp. 3981\u201387.","DOI":"10.1609\/aaai.v31i1.11198"},{"key":"238_CR5","unstructured":"Chen H, Zhang H, Chen PY, Yi J, Hsieh CJ Show-and-fool: crafting adversarial examples for neural image captioning. arXiv preprint. 2017; arXiv:1712.02051."},{"key":"238_CR6","doi-asserted-by":"crossref","unstructured":"Chen T, Zhang Z, You Q, Fang C, Wang Z, Jin H, Luo J. Factual or emotional: stylized image captioning with adaptive learning and attention. arXiv preprint. 2018; arXiv:1807.03871.","DOI":"10.1007\/978-3-030-01249-6_32"},{"key":"238_CR7","doi-asserted-by":"crossref","unstructured":"Chen F, Ji R, Sun X, Wu Y, Su J. GroupCap: group-based image captioning with structured relevance and diversity constraints. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2018, pp. 1345\u201353.","DOI":"10.1109\/CVPR.2018.00146"},{"key":"238_CR8","doi-asserted-by":"crossref","unstructured":"Chen X, Lawrence Zitnick C. Mind\u2019s eye: a recurrent visual representation for image caption generation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015, pp. 2422\u201331.","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"238_CR9","doi-asserted-by":"crossref","unstructured":"Chen F, Ji R, Su J, Wu Y, Wu Y. Structcap: structured semantic embedding for image captioning. In: Proceedings of the 2017 ACM on multimedia conference, ACM, 2017, pp. 46\u201354.","DOI":"10.1145\/3123266.3123275"},{"key":"238_CR10","unstructured":"Chunseong Park C, Kim B, Kim G. Attend to you: personalized image captioning with context sequence memory networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2017, pp. 895\u2013903."},{"key":"238_CR11","doi-asserted-by":"crossref","unstructured":"Cohn-Gordon R, Goodman N, Potts C. Pragmatically informative image captioning with character-level reference. arXiv preprint. 2018; arXiv:1804.05417.","DOI":"10.18653\/v1\/N18-2070"},{"issue":"2","key":"238_CR12","doi-asserted-by":"publisher","first-page":"48","DOI":"10.1145\/3177745","volume":"14","author":"M Cornia","year":"2018","unstructured":"Cornia M, Baraldi L, Serra G, Cucchiara R. Paying more attention to saliency: image captioning with saliency and context attention. ACM Trans Multimed Comput Commun Appl. 2018;14(2):48.","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"238_CR13","doi-asserted-by":"crossref","unstructured":"Devlin J, et al. Language models for image captioning: the quirks and what works. arXiv preprint. 2015; arXiv:1505.01809.","DOI":"10.3115\/v1\/P15-2017"},{"key":"238_CR14","unstructured":"Devlin J, Gupta S, Girshick R, Mitchell M, Zitnick CL. Exploring nearest neighbor approaches for image captioning. arXiv preprint. 2015; arXiv:1505.04467."},{"key":"238_CR15","doi-asserted-by":"crossref","unstructured":"Donahue J, et al. Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015, pp. 2625\u201334.","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"238_CR16","doi-asserted-by":"crossref","unstructured":"Fang H, et al. From captions to visual concepts and back. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015, pp. 1473\u201382.","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"238_CR17","doi-asserted-by":"crossref","unstructured":"Farhadi A, et al. Every picture tells a story: generating sentences from images. In: European conference on computer vision, Springer, Berlin, Heidelberg, 2010.","DOI":"10.1007\/978-3-642-15561-1_2"},{"issue":"12","key":"238_CR18","doi-asserted-by":"publisher","first-page":"2321","DOI":"10.1109\/TPAMI.2016.2642953","volume":"39","author":"K Fu","year":"2017","unstructured":"Fu K, Jin J, Cui R, Sha F, Zhang C. Aligning where to see and what to tell: Image captioning with region-based attention and scene-specific contexts. IEEE Trans Pattern Anal Mach Intell. 2017;39(12):2321\u201334.","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"238_CR19","first-page":"1","volume":"99","author":"K Fu","year":"2018","unstructured":"Fu K, Li J, Jin J, Zhang C. Image-text surgery: efficient concept learning in image captioning by generating pseudopairs. IEEE Trans Neural Netw Learn Syst. 2018;99:1\u201312.","journal-title":"IEEE Trans Neural Netw Learn Syst"},{"key":"238_CR20","doi-asserted-by":"crossref","unstructured":"Gan C, et al. Stylenet: generating attractive visual captions with styles. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, 2017, pp. 3137\u201346.","DOI":"10.1109\/CVPR.2017.108"},{"key":"238_CR21","doi-asserted-by":"crossref","unstructured":"Gan Z, et al. Semantic compositional networks for visual captioning. arXiv preprint. 2016; arXiv:1611.08002.","DOI":"10.1109\/CVPR.2017.127"},{"key":"238_CR22","doi-asserted-by":"crossref","unstructured":"Girshick R, et al. Rich feature hierarchies for accurate object detection and semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2014. pp. 580\u201387.","DOI":"10.1109\/CVPR.2014.81"},{"key":"238_CR23","doi-asserted-by":"crossref","unstructured":"Harzig P, Brehm S, Lienhart R, Kaiser C, Schallner R. Multimodal image captioning for marketing analysis. arXiv preprint. 2018; arXiv:1802.01958.","DOI":"10.1109\/MIPR.2018.00035"},{"key":"238_CR24","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J. Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res. 2013;47:853\u201399.","journal-title":"J Artif Intell Res"},{"key":"238_CR25","doi-asserted-by":"crossref","unstructured":"Jia X, et al. Guiding the long-short term memory model for image caption generation. In: Proceedings of the IEEE international conference on computer vision, 2015, pp. 2407\u201315.","DOI":"10.1109\/ICCV.2015.277"},{"key":"238_CR26","doi-asserted-by":"crossref","unstructured":"Jiang W, Ma L, Chen X, Zhang H, Liu W. Learning to guide decoding for image captioning. arXiv preprint. 2018; arXiv:1804.00887.","DOI":"10.1609\/aaai.v32i1.12283"},{"key":"238_CR27","unstructured":"Jin J, et al. Aligning where to see and what to tell: image caption with region-based attention and scene factorization. arXiv preprint. 2015; arXiv:1506.06272."},{"key":"238_CR28","unstructured":"Karpathy A, Armand J, Fei Fei FL. Deep fragment embeddings for bidirectional image sentence mapping. In: Advances in neural information processing systems, 2014, pp. 1889\u201397."},{"key":"238_CR29","doi-asserted-by":"crossref","unstructured":"Karpathy A, Fei-Fei L. Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015. pp. 3128\u201337.","DOI":"10.1109\/CVPR.2015.7298932"},{"issue":"6","key":"238_CR30","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1049\/iet-cvi.2016.0286","volume":"11","author":"M Kilickaya","year":"2017","unstructured":"Kilickaya M, Akkus BK, Cakici R, Erdem A, Erdem E, Ikizler-Cinbis N. Data-driven image captioning via salient region discovery. IET Comput Vis. 2017;11(6):398\u2013406.","journal-title":"IET Comput Vis"},{"key":"238_CR31","unstructured":"Kiros R, Ruslan S, Zemel RS. Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint. 2014; arXiv:1411.2539."},{"key":"238_CR32","unstructured":"Kiros R, Zemel R, Salakhutdinov Ruslan R. A multiplicative model for learning distributed text-based attribute representations. Adv Neural Inf Process Syst. 2014."},{"key":"238_CR33","unstructured":"Kiros R, Salakhutdinov R, Zemel R. Multimodal neural language models. In: International conference on machine learning, 2014, pp. 595\u2013603."},{"issue":"1","key":"238_CR34","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R, et al. Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vis. 2017;123(1):32\u201373.","journal-title":"Int J Comput Vis"},{"issue":"12","key":"238_CR35","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni G, et al. Babytalk: understanding and generating simple image descriptions. IEEE Trans Pattern Anal Mach Intell. 2013;35(12):2891\u2013903.","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"10","key":"238_CR36","doi-asserted-by":"crossref","first-page":"351","DOI":"10.1162\/tacl_a_00188","volume":"2","author":"P Kuznetsova","year":"2014","unstructured":"Kuznetsova P, et al. TREETALK: composition and compression of trees for image descriptions. TACL. 2014;2(10):351\u201362.","journal-title":"TACL"},{"key":"238_CR37","doi-asserted-by":"crossref","unstructured":"LTran D, et al. Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision, 2015, pp. 4489\u201397.","DOI":"10.1109\/ICCV.2015.510"},{"key":"238_CR38","doi-asserted-by":"crossref","unstructured":"Li X, Wang X, Xu C, Lan W, Wei Q, Yang G, Xu J. COCO-CN for cross-lingual image tagging, captioning and retrieval. arXiv preprint. 2018; arXiv:1805.08661.","DOI":"10.1109\/TMM.2019.2896494"},{"key":"238_CR39","unstructured":"Li S, et al. Composing simple image descriptions using web-scale n-grams. In: Proceedings of the fifteenth conference on computational natural language learning. Association for computational linguistics, 2011."},{"key":"238_CR40","first-page":"3","volume":"3","author":"S Liu","year":"2017","unstructured":"Liu S, Zhu Z, Ye N, Guadarrama S, Murphy K. Improved image captioning via policy gradient optimization of spider. Proc IEEE Int Conf Comput Vis. 2017;3:3.","journal-title":"Proc IEEE Int Conf Comput Vis"},{"key":"238_CR41","doi-asserted-by":"crossref","unstructured":"Liu C, Sun F, Wang C, Wang F, Yuille A. MAT: a multimodal attentive translator for image captioning. arXiv preprint. 2017; arXiv:1702.05658.","DOI":"10.24963\/ijcai.2017\/563"},{"key":"238_CR42","doi-asserted-by":"crossref","unstructured":"Liu X, Li H, Shao J, Chen D, Wang X. Show, tell and discriminate: image captioning by self-retrieval with partially labeled data. arXiv preprint. 2018); arXiv:1803.08314.","DOI":"10.1007\/978-3-030-01267-0_21"},{"key":"238_CR43","doi-asserted-by":"crossref","unstructured":"Liu C, Mao J, Sha F, Yuille AL. Attention correctness in neural image captioning. In: AAAI, 2017, pp. 4176\u201382.","DOI":"10.1609\/aaai.v31i1.11197"},{"key":"238_CR44","first-page":"2","volume":"6","author":"J Lu","year":"2017","unstructured":"Lu J, Xiong C, Parikh D, Socher R. Knowing when to look: adaptive attention via a visual sentinel for image captioning. Proc IEEE Conf Comput Vis Pattern Recognit. 2017;6:2.","journal-title":"Proc IEEE Conf Comput Vis Pattern Recognit"},{"key":"238_CR45","doi-asserted-by":"crossref","unstructured":"Lu D, Whitehead S, Huang L, Ji H, Chang SF. Entity-aware image caption generation. arXiv preprint. 2018; arXiv:1804.07889.","DOI":"10.18653\/v1\/D18-1435"},{"key":"238_CR46","doi-asserted-by":"crossref","unstructured":"Lu J, Yang J, Batra D, Parikh D. Neural baby talk. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2018, pp. 7219\u201328.","DOI":"10.1109\/CVPR.2018.00754"},{"key":"238_CR47","unstructured":"Mao J, et al. Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint. 2014; arXiv:1412.6632."},{"key":"238_CR48","doi-asserted-by":"crossref","unstructured":"Mao J, et al. Learning like a child: fast novel visual concept learning from sentence descriptions of images. In: Proceedings of the IEEE international conference on computer vision, 2015, pp. 2533\u201341.","DOI":"10.1109\/ICCV.2015.291"},{"key":"238_CR49","doi-asserted-by":"crossref","unstructured":"Mathews AP, Lexing X, Xuming H. SentiCap: generating image descriptions with sentiments. In: Thirtieth AAAI conference on artificial intelligence. 2016.","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"238_CR50","unstructured":"Melnyk I, Sercu T, Dognin PL, Ross J, Mroueh Y. Improved image captioning with adversarial semantic alignment. arXiv preprint. 2018; arXiv:1805.00063."},{"key":"238_CR51","doi-asserted-by":"crossref","unstructured":"Memisevic R, Geoffrey H. Unsupervised learning of image transformations. In: 2007 IEEE Conference on Computer Vision and Pattern Recognition, IEEE, 2007, pp. 1\u20138.","DOI":"10.1109\/CVPR.2007.383036"},{"key":"238_CR52","unstructured":"Mitchell M, et al. Midge: generating image descriptions from computer vision detections. In: Proceedings of the 13th conference of the European chapter of the association for computational linguistics. Association for computational linguistics, 2012."},{"key":"238_CR53","unstructured":"Ordonez V, Girish K, Berg TL. Im2text: describing images using 1 million captioned photographs. In: Advances in neural information processing systems, 2011, pp. 1143\u201351."},{"key":"238_CR54","doi-asserted-by":"crossref","unstructured":"Palangi H, Smolensky P, He X, Deng L. Question-answering with grammatically-interpretable representations. 2017. arXiv:1705.08432","DOI":"10.1609\/aaai.v32i1.12004"},{"issue":"4","key":"238_CR55","doi-asserted-by":"publisher","first-page":"999","DOI":"10.1109\/TPAMI.2018.2824816","volume":"41","author":"CC Park","year":"2018","unstructured":"Park CC, Kim B, Kim G. Towards personalized image captioning via multimodal memory networks. IEEE Trans Pattern Anal Mach. 2018;41(4):999\u201312.","journal-title":"IEEE Trans Pattern Anal Mach."},{"key":"238_CR56","unstructured":"Pu Y, et al. Variational autoencoder for deep learning of images, labels and captions. Adv Neural Inf Process Syst. 2016."},{"key":"238_CR57","doi-asserted-by":"crossref","unstructured":"Ren Z, Wang X, Zhang N, Lv X, Li LJ. Deep reinforcement learning-based image captioning with embedding reward. arXiv preprint. 2017; arXiv:1704.03899.","DOI":"10.1109\/CVPR.2017.128"},{"issue":"2","key":"238_CR58","first-page":"3","volume":"1","author":"SJ Rennie","year":"2017","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V. Self-critical sequence training for image captioning. CVPR. 2017;1(2):3.","journal-title":"CVPR"},{"key":"238_CR59","doi-asserted-by":"crossref","unstructured":"Sharma P, Ding N, Goodman S, Soricut R. Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th annual meeting of the association for computational linguistics, 2018, vol. 1, pp. 2556\u201365.","DOI":"10.18653\/v1\/P18-1238"},{"key":"238_CR60","doi-asserted-by":"publisher","first-page":"207","DOI":"10.1162\/tacl_a_00177","volume":"2","author":"R Socher","year":"2014","unstructured":"Socher R, et al. Grounded compositional semantics for finding and describing images with sentences. Trans Assoc Comput Linguist. 2014;2:207\u201318.","journal-title":"Trans Assoc Comput Linguist"},{"issue":"4","key":"238_CR61","doi-asserted-by":"publisher","first-page":"689","DOI":"10.1007\/s12065-019-00278-7","volume":"12","author":"C Sur","year":"2019","unstructured":"Sur C. UCRLF: unified constrained reinforcement learning framework for phase-aware architectures for autonomous vehicle signaling and trajectory optimization. Evol Intell. 2019;12(4):689\u201312.","journal-title":"Evol Intell."},{"issue":"22","key":"238_CR62","doi-asserted-by":"publisher","first-page":"32187","DOI":"10.1007\/s11042-019-08021-1","volume":"78","author":"C Sur","year":"2019","unstructured":"Sur C. Survey of deep learning and architectures for visual captioning-transitioning between media and natural languages. Multimed Tools Appl. 2019;78(22):32187\u2013237.","journal-title":"Multimed Tools Appl."},{"key":"238_CR63","unstructured":"Sur C. Representation for language understanding. Gainesville: University of Florida; 2018. pp. 1\u201390. https:\/\/drive.google.com\/file\/d\/15Fhmt5aM_b0J5jtE9mdWInQPfDS3TqVw."},{"key":"238_CR64","unstructured":"Sur C. SACT: Self-aware multi-space feature composition transformer for multinomial attention for video captioning. 2020; arXiv:2006.14262."},{"key":"238_CR65","unstructured":"Sur C. ReLGAN: generalization of consistency for GAN with disjoint constraints and relative learning of generative processes for multiple transformation learning. 2020; arXiv:2006.07809."},{"key":"238_CR66","unstructured":"Sur C. Self-segregating and coordinated-segregating transformer for focused deep multi-modular network for visual question answering. 202; arXiv:2006.14264."},{"key":"238_CR67","unstructured":"Sur C. Gaussian smoothen semantic features (GSSF)--exploring the linguistic aspects of visual captioning in Indian languages (Bengali) using MSCOCO framework. 2020; arXiv:2002.06701"},{"key":"238_CR68","doi-asserted-by":"crossref","unstructured":"Sur C. MRRC: Multiple role representation crossover interpretation for image captioning with R-CNN feature distribution composition (FDC). 2020;arXiv:2002.06436.","DOI":"10.1007\/s11042-021-10578-9"},{"key":"238_CR69","doi-asserted-by":"crossref","unstructured":"Sur C. aiTPR: Attribute Interaction-Tensor Product Representation for Image Caption. 2020;arXiv:2001.09545.","DOI":"10.1007\/s11063-021-10438-5"},{"key":"238_CR70","doi-asserted-by":"crossref","unstructured":"Sur C. CRUR: Coupled-Recurrent Unit for Unification, Conceptualization and Context Capture for Language Representation--A Generalization of Bi Directional LSTM. 2019;arXiv:1911.10132 .","DOI":"10.1007\/s11042-020-09865-8"},{"key":"238_CR71","unstructured":"Sur C. Tpsgtr: Neural-symbolic tensor product scene-graph-triplet representation for image captioning. 2019;arXiv:1911.10115."},{"key":"238_CR72","doi-asserted-by":"crossref","unstructured":"Sur C, Pei L, Yingjie Z, Dapeng W. Semantic tensor product for image captioning. In: 2019 5th international conference on big data computing and communications (BIGCOM), pp. 33\u201337. IEEE, 2019.","DOI":"10.1109\/BIGCOM.2019.00013"},{"key":"238_CR73","unstructured":"Sur C. Feature Fusion Effects of Tensor Product Representation on (De) Compositional Network for Caption Generation for Images. 2018;arXiv:1812.06624."},{"key":"238_CR74","unstructured":"Sutskever I, James M, Hinton GE. Generating text with recurrent neural networks. In: Proceedings of the 28th international conference on machine learning (ICML-11), 2011."},{"key":"238_CR75","unstructured":"Sutskever I, Vinyals O, Le QV. Sequence to sequence learning with neural networks. In: Advances in neural information processing systems, 2014, pp. 3104\u201312."},{"key":"238_CR76","doi-asserted-by":"crossref","unstructured":"Tavakoliy HR, Shetty R, Borji A, Laaksonen J. Paying attention to descriptions generated by image captioning models. In: Computer vision (ICCV), 2017 IEEE international conference, IEEE, 2017, pp. 2506\u201315.","DOI":"10.1109\/ICCV.2017.272"},{"key":"238_CR77","doi-asserted-by":"crossref","unstructured":"Tran K, et al. Rich image captioning in the wild. In: Proceedings of the IEEE conference on computer vision and pattern recognition workshops, pp. 49\u201356, 2016.","DOI":"10.1109\/CVPRW.2016.61"},{"issue":"4","key":"238_CR78","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2017","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D. Show and tell: lessons learned from the 2015 mscoco image captioning challenge. IEEE Trans Pattern Anal Mach Intell. 2017;39(4):652\u201363.","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"238_CR79","doi-asserted-by":"crossref","unstructured":"Vinyals O, et al. Show and tell: a neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2015, pp. 3156\u20133164.","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"2s","key":"238_CR80","first-page":"40","volume":"14","author":"C Wang","year":"2018","unstructured":"Wang C, Haojin Y, Christoph M. Image captioning with deep bidirectional LSTMs and multi-task learning. ACM Trans Multimed Comput Commun Appl. 2018;14(2s):40.","journal-title":"ACM Trans Multimed Comput Commun Appl"},{"key":"238_CR81","doi-asserted-by":"crossref","unstructured":"Wang Y, Lin Z, Shen X, Cohen S, Cottrell GW. Skeleton key: image captioning by skeleton-attribute decomposition. arXiv preprint 2017;arXiv:1704.06972.","DOI":"10.1109\/CVPR.2017.780"},{"issue":"6","key":"238_CR82","doi-asserted-by":"publisher","first-page":"1367","DOI":"10.1109\/TPAMI.2017.2708709","volume":"40","author":"Q Wu","year":"2017","unstructured":"Wu Q, Shen C, Wang P, Dick A, van den Hengel A. Image captioning and visual question answering based on attributes and external knowledge. IEEE Trans Pattern Anal Mach. 2017;40(6):1367\u201381.","journal-title":"IEEE Trans Pattern Anal Mach."},{"key":"238_CR83","doi-asserted-by":"publisher","first-page":"100","DOI":"10.1016\/j.image.2018.06.002","volume":"67","author":"C Wu","year":"2018","unstructured":"Wu C, Wei Y, Chu X, Su F, Wang L. Modeling visual and word-conditional semantic attention for image captioning. Signal Process Image Commun. 2018;67:100\u20137.","journal-title":"Signal Process Image Commun."},{"key":"238_CR84","doi-asserted-by":"crossref","unstructured":"Wu Q, et al. What value do explicit high level concepts have in vision to language problems?. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2016, pp. 203\u201312.","DOI":"10.1109\/CVPR.2016.29"},{"key":"238_CR85","unstructured":"Wu J, Hu Z, Mooney RJ. Joint image captioning and question answering. arXiv preprint. 2018;arXiv:1805.08389."},{"key":"238_CR86","unstructured":"Xu K, et al. Show, attend and tell: neural image caption generation with visual attention. In: International conference on machine learning, 2015, pp. 2048\u201357."},{"key":"238_CR87","unstructured":"Yang Z, et al. Review networks for caption generation. In: Advances in neural information processing systems, 2016, pp. 2361\u201369."},{"key":"238_CR88","unstructured":"Yang Y, et al. Corpus-guided sentence generation of natural images. In: Proceedings of the conference on empirical methods in natural language processing. Association for computational linguistics, 2011."},{"key":"238_CR89","unstructured":"Yang Z, Yuan Y, Wu Y, Salakhutdinov R, Cohen WW. Encode, review, and decode: reviewer module for caption generation. arXiv preprint. 2016;arXiv:1605.07912."},{"key":"238_CR90","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Mei T. Incorporating copying mechanism in image captioning for learning novel objects. In: 2017 IEEE conference on computer vision and pattern recognition (CVPR), IEEE, 2017, pp. 5263\u201371.","DOI":"10.1109\/CVPR.2017.559"},{"key":"238_CR91","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Qiu Z, Mei T. Boosting image captioning with attributes. In: IEEE international conference on computer vision, ICCV, 2017, pp. 22\u201329.","DOI":"10.1109\/ICCV.2017.524"},{"key":"238_CR92","doi-asserted-by":"crossref","unstructured":"Ye S, Liu N, Han J. Attentive linear transformation for image captioning. IEEE Trans Image Process. 2018.","DOI":"10.1109\/TIP.2018.2855406"},{"key":"238_CR93","unstructured":"You Q, Jin H, Luo J. Image captioning at Will: a versatile scheme for effectively injecting sentiments into image descriptions. arXiv preprint. 2018;arXiv:1801.10121."},{"key":"238_CR94","doi-asserted-by":"crossref","unstructured":"You Q, et al. Image captioning with semantic attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition, 2016, pp. 4651\u201359.","DOI":"10.1109\/CVPR.2016.503"},{"key":"238_CR95","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young P, et al. From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans Assoc Comput Linguist. 2014;2:67\u201378.","journal-title":"Trans Assoc Comput Linguist"},{"issue":"1","key":"238_CR96","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1109\/TIP.2018.2855415","volume":"28","author":"M Zhang","year":"2018","unstructured":"Zhang M, Yang Y, Zhang H, Ji Y, Shen HT, Chua TS. More is better: precise and detailed image captioning using online positive recall and missing concepts mining. IEEE Trans Image Process. 2018;28(1):32\u201344.","journal-title":"IEEE Trans Image Process."},{"key":"238_CR97","unstructured":"Zhang L, Sung F, Liu F, Xiang T, Gong S, Yang Y, Hospedales TM. Actor-critic sequence training for image captioning. arXiv preprint. 2017; arXiv:1706.09601."},{"key":"238_CR98","doi-asserted-by":"crossref","unstructured":"Zhao W, Wang B, Ye J, Yang M, Zhao Z, Luo R, Qiao Y. A multi-task learning approach for image captioning. In: IJCAI, 2018, pp. 1205\u201311.","DOI":"10.24963\/ijcai.2018\/168"}],"container-title":["SN Computer Science"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-020-00238-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s42979-020-00238-4\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s42979-020-00238-4.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,10,3]],"date-time":"2023-10-03T17:50:35Z","timestamp":1696355435000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s42979-020-00238-4"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2020,7]]},"references-count":98,"journal-issue":{"issue":"4","published-print":{"date-parts":[[2020,7]]}},"alternative-id":["238"],"URL":"https:\/\/doi.org\/10.1007\/s42979-020-00238-4","relation":{},"ISSN":["2662-995X","2661-8907"],"issn-type":[{"type":"print","value":"2662-995X"},{"type":"electronic","value":"2661-8907"}],"subject":[],"published":{"date-parts":[[2020,7]]},"assertion":[{"value":"26 January 2020","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"19 June 2020","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"8 July 2020","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Compliance with Ethical Standards"}},{"value":"On behalf of all authors, the corresponding author states that there is no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflict of Interest"}}],"article-number":"229"}}