{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,25]],"date-time":"2026-02-25T17:11:13Z","timestamp":1772039473635,"version":"3.50.1"},"reference-count":172,"publisher":"Springer Science and Business Media LLC","issue":"22","license":[{"start":{"date-parts":[[2019,7,31]],"date-time":"2019-07-31T00:00:00Z","timestamp":1564531200000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2019,7,31]],"date-time":"2019-07-31T00:00:00Z","timestamp":1564531200000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"published-print":{"date-parts":[[2019,11]]},"DOI":"10.1007\/s11042-019-08021-1","type":"journal-article","created":{"date-parts":[[2019,7,31]],"date-time":"2019-07-31T12:02:51Z","timestamp":1564574571000},"page":"32187-32237","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":16,"title":["Survey of deep learning and architectures for visual captioning\u2014transitioning between media and natural languages"],"prefix":"10.1007","volume":"78","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-1563-9304","authenticated-orcid":false,"given":"Chiranjib","family":"Sur","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,7,31]]},"reference":[{"key":"8021_CR1","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR, vol 3, p 6","DOI":"10.1109\/CVPR.2018.00636"},{"key":"8021_CR2","unstructured":"Baldi P (2012) Autoencoders, unsupervised learning, and deep architectures. In: ICML Unsupervised and Transfer Learning, vol 27, p 1"},{"key":"8021_CR3","doi-asserted-by":"crossref","unstructured":"Bayer J, Wierstra D, Togelius J, Schmidhuber J (2009) Evolving memory cell structures for sequence learning. In: International conference on artificial neural networks. Springer, Berlin, pp 755\u2013764","DOI":"10.1007\/978-3-642-04277-5_76"},{"issue":"4","key":"8021_CR4","doi-asserted-by":"publisher","first-page":"27","DOI":"10.1109\/45.329294","volume":"13","author":"G Bebis","year":"1994","unstructured":"Bebis G, Georgiopoulos M (1994) Feed-forward neural networks. IEEE Potentials 13(4):27\u201331","journal-title":"IEEE Potentials"},{"key":"8021_CR5","first-page":"1137","volume":"3","author":"Y Bengio","year":"2003","unstructured":"Bengio Y, Ducharme R, Vincent P, Jauvin C (2003) A neural probabilistic language model. J Mach Learn Res 3:1137\u20131155","journal-title":"J Mach Learn Res"},{"key":"8021_CR6","doi-asserted-by":"crossref","unstructured":"Bengio Y, Lamblin P, Popovici P, Larochelle H (2007) Greedy layer-wise training of deep networks. In: Advances in neural information processing systems 19. MIT Press, Cambridge","DOI":"10.7551\/mitpress\/7503.003.0024"},{"key":"8021_CR7","doi-asserted-by":"crossref","unstructured":"Bengio Y, Boulanger-Lewandowski N, Pascanu R (2013) Advances in optimizing recurrent networks. In: 2013 IEEE international conference on acoustics, speech and signal processing. IEEE, pp 8624\u20138628","DOI":"10.1109\/ICASSP.2013.6639349"},{"key":"8021_CR8","unstructured":"Bordes A, Weston J Learning end-to-end goal-oriented dialog. arXiv: 1605.07683"},{"key":"8021_CR9","unstructured":"Bordes A, Usunier N, Chopra S, Weston J Large-scale simple question answering with memory networks. arXiv: 1506:02075"},{"key":"8021_CR10","doi-asserted-by":"crossref","unstructured":"Chen X, Zitnick CL (2015) Mind\u2019s eye: a recurrent visual representation for image caption generation. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298856"},{"issue":"2","key":"8021_CR11","doi-asserted-by":"publisher","first-page":"302","DOI":"10.1109\/72.80341","volume":"2","author":"S Chen","year":"1991","unstructured":"Chen S, Cowan CF, Grant PM (1991) Orthogonal least squares learning algorithm for radial basis function networks. IEEE Trans Neural Netw 2(2):302\u2013309","journal-title":"IEEE Trans Neural Netw"},{"key":"8021_CR12","unstructured":"Chen W, Wilson JT, Tyree S, Weinberger KQ, Chen Y (2015) Compressing neural networks with the hashing trick. arXiv: 1504.04788"},{"key":"8021_CR13","doi-asserted-by":"crossref","unstructured":"Chen M, Ding G, Zhao S, Chen H, Liu Q, Han J (2017) Reference based LSTM for image captioning. In: AAAI, pp 3981\u20133987","DOI":"10.1609\/aaai.v31i1.11198"},{"key":"8021_CR14","doi-asserted-by":"crossref","unstructured":"Chen F, Ji R, Su J, Wu Y, Wu Y (2017) Structcap: structured semantic embedding for image captioning. In: Proceedings of the 2017 ACM on multimedia conference. ACM, pp 46\u201354","DOI":"10.1145\/3123266.3123275"},{"key":"8021_CR15","unstructured":"Chen H, Zhang H, Chen P Y, Yi J, Hsieh CJ (2017) Show-and-fool: crafting adversarial examples for neural image captioning. arXiv: 1712.02051"},{"key":"8021_CR16","doi-asserted-by":"crossref","unstructured":"Chen H, Ding G, Lin Z, Zhao S, Han J (2018) Show, observe and tell: attribute-driven attention model for image captioning. In: IJCAI, pp 606\u2013612","DOI":"10.24963\/ijcai.2018\/84"},{"key":"8021_CR17","doi-asserted-by":"crossref","unstructured":"Chen F, Ji R, Sun X, Wu Y, Su J (2018) GroupCap: group-based image captioning with structured relevance and diversity constraints. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1345\u20131353","DOI":"10.1109\/CVPR.2018.00146"},{"key":"8021_CR18","doi-asserted-by":"crossref","unstructured":"Chen T, Zhang Z, You Q, Fang C, Wang Z, Jin H, Luo J (2018) \u201cFactual\u201d or \u201cEmotional\u201d: stylized image captioning with adaptive learning and attention. arXiv: 1807.03871","DOI":"10.1007\/978-3-030-01249-6_32"},{"key":"8021_CR19","unstructured":"Cho Y, Saul LK (2009) Kernel methods for deep learning. In: Advances in neural information processing systems, pp 342\u2013350"},{"key":"8021_CR20","doi-asserted-by":"crossref","unstructured":"Cho K, Van Merri\u00ebnboer B, Bahdanau D, Bengio Y (2014) On the properties of neural machine translation: encoder-decoder approaches. arXiv: 1409.1259","DOI":"10.3115\/v1\/W14-4012"},{"key":"8021_CR21","doi-asserted-by":"crossref","unstructured":"Cho K, Van Merri\u00ebnboer B, Gulcehre C, Bahdanau D, Bougares F, Schwenk H, Bengio Y (2014) Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv: 1406.1078","DOI":"10.3115\/v1\/D14-1179"},{"key":"8021_CR22","unstructured":"Cohn-Gordon R, Goodman N, Potts C (2018) Pragmatically informative image captioning with character-level reference. arXiv: 1804.05417"},{"key":"8021_CR23","doi-asserted-by":"crossref","unstructured":"Collobert R, Weston J (2008) A unified architecture for natural language processing: deep neural networks with multitask learning. In: Proceedings of the 25th international conference on Machine learning. ACM, pp 160\u2013167","DOI":"10.1145\/1390156.1390177"},{"issue":"2","key":"8021_CR24","first-page":"48","volume":"14","author":"M Cornia","year":"2018","unstructured":"Cornia M, Baraldi L, Serra G, Cucchiara R (2018) Paying more attention to saliency: image captioning with saliency and context attention. ACM Trans Multimed Comput Commun Appl (TOMM) 14(2):48","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"key":"8021_CR25","unstructured":"Courville AC, Bergstra J, Bengio Y (2011) A spike and slab restricted Boltzmann machine. In: AISTATS, vol 1, p 5"},{"key":"8021_CR26","doi-asserted-by":"crossref","unstructured":"Devlin J et al (2015) Language models for image captioning: the quirks and what works. arXiv: 1505.01809","DOI":"10.3115\/v1\/P15-2017"},{"issue":"1","key":"8021_CR27","first-page":"3035","volume":"16","author":"PS Dhillon","year":"2015","unstructured":"Dhillon PS, Foster DP, Ungar LH (2015) Eigenwords: spectral word embeddings. J Mach Learn Res 16(1):3035\u20133078","journal-title":"J Mach Learn Res"},{"key":"8021_CR28","unstructured":"Doersch C (2016) Tutorial on variational autoencoders. arXiv: 1606.05908"},{"key":"8021_CR29","doi-asserted-by":"crossref","unstructured":"Donahue J et al (2015) Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298878"},{"issue":"4","key":"8021_CR30","doi-asserted-by":"publisher","first-page":"1017","DOI":"10.1109\/TCYB.2016.2536638","volume":"47","author":"B Du","year":"2016","unstructured":"Du B, Xiong W, Wu J, Zhang L, Zhang L, Tao D (2016) Stacked convolutional denoising auto-encoders for feature representation. IEEE trans Cybern 47(4):1017\u20131027","journal-title":"IEEE trans Cybern"},{"key":"8021_CR31","doi-asserted-by":"crossref","unstructured":"Fang H et al (2015) From captions to visual concepts and back. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"8021_CR32","doi-asserted-by":"crossref","unstructured":"Farhadi A et al (2010) Every picture tells a story: Generating sentences from images. In: European conference on computer vision. Springer, Berlin","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"8021_CR33","unstructured":"Fu J et al (2016) Deep Q-networks for accelerating the training of deep neural networks. arXiv: 1606.01467"},{"issue":"12","key":"8021_CR34","doi-asserted-by":"publisher","first-page":"2321","DOI":"10.1109\/TPAMI.2016.2642953","volume":"39","author":"K Fu","year":"2017","unstructured":"Fu K, Jin J, Cui R, Sha F, Zhang C (2017) Aligning where to see and what to tell: image captioning with region-based attention and scene-specific contexts. IEEE Trans Pattern Anal Mach Intell 39(12):2321\u20132334","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"8021_CR35","doi-asserted-by":"crossref","unstructured":"Fu K, Li J, Jin J, Zhang C (2018) Image-text surgery: efficient concept learning in image captioning by generating pseudopairs. IEEE Trans Neural Netw Learn Syst 29.12(2018):5910\u20135921","DOI":"10.1109\/TNNLS.2018.2813306"},{"issue":"6","key":"8021_CR36","doi-asserted-by":"publisher","first-page":"801","DOI":"10.1016\/S0893-6080(05)80125-X","volume":"6","author":"KI Funahashi","year":"1993","unstructured":"Funahashi KI, Nakamura Y (1993) Approximation of dynamical systems by continuous time recurrent neural networks. Neural Netw 6(6):801\u2013806","journal-title":"Neural Netw"},{"key":"8021_CR37","unstructured":"Gan Z et al (2016) Semantic compositional networks for visual captioning. arXiv: 1611.08002"},{"key":"8021_CR38","doi-asserted-by":"crossref","unstructured":"Gan C et al (2017) Stylenet: generating attractive visual captions with styles. In: CVPR","DOI":"10.1109\/CVPR.2017.108"},{"key":"8021_CR39","doi-asserted-by":"crossref","unstructured":"Girshick R et al (2014) Rich feature hierarchies for accurate object detection and semantic segmentation. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2014.81"},{"key":"8021_CR40","unstructured":"Goldberg Y, Levy O (2014) word2vec explained: deriving Mikolov et al.\u2019s negative-sampling word-embedding method. arXiv: 1402.3722"},{"key":"8021_CR41","doi-asserted-by":"crossref","unstructured":"Graupe D (1997) Large scale memory storage and retrieval (LAMSTAR) network. In: Principles of artificial neural networks, pp 191\u2013222","DOI":"10.1142\/9789812385383_0013"},{"key":"8021_CR42","unstructured":"Graves A, Wayne G, Danihelka I (2014) Neural turing machines. arXiv: 1410.5401"},{"key":"8021_CR43","unstructured":"Han S, Mao H, Dally WJ (2015) Deep compression: compressing deep neural network with pruning, trained quantization and Huffman coding. arXiv: 1510.00149"},{"key":"8021_CR44","doi-asserted-by":"crossref","unstructured":"Harzig P, Brehm S, Lienhart R, Kaiser C, Schallner R (2018) Multimodal image captioning for marketing analysis. arXiv: 1802.01958","DOI":"10.1109\/MIPR.2018.00035"},{"key":"8021_CR45","unstructured":"He K, Zhang X, Ren S, Sun J (2015) Deep residual learning for image recognition. arXiv: 1512.03385"},{"issue":"1","key":"8021_CR46","doi-asserted-by":"publisher","first-page":"81","DOI":"10.1109\/36.124218","volume":"30","author":"PD Heermann","year":"1992","unstructured":"Heermann PD, Khazenie N (1992) Classification of multispectral remote sensing data using a back-propagation neural network. IEEE Trans Geosci Remote Sens 30 (1):81\u201388","journal-title":"IEEE Trans Geosci Remote Sens"},{"key":"8021_CR47","doi-asserted-by":"crossref","unstructured":"Hendricks LA et al (2016) Deep compositional captioning: describing novel object categories without paired training data. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2016.8"},{"key":"8021_CR48","doi-asserted-by":"publisher","first-page":"31","DOI":"10.3389\/neuro.09.031.2009","volume":"3","author":"S Herculano-Houzel","year":"2009","unstructured":"Herculano-Houzel S (2009) The human brain in numbers: a linearly scaled-up primate brain. Front Hum Neurosci 3:31","journal-title":"Front Hum Neurosci"},{"key":"8021_CR49","unstructured":"Hinton GE (1986) Learning distributed representations of concepts. In: Proceedings of the eighth annual conference of the cognitive science society, vol 1, p 12"},{"key":"8021_CR50","doi-asserted-by":"publisher","first-page":"1527","DOI":"10.1162\/neco.2006.18.7.1527","volume":"18","author":"GE Hinton","year":"2006","unstructured":"Hinton GE, Osindero S, Teh YW (2006) A fast learning algorithm for deep belief nets. Neural Comput 18:1527\u20131554","journal-title":"Neural Comput"},{"issue":"8","key":"8021_CR51","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Comput 9(8):1735\u20131780","journal-title":"Neural Comput"},{"key":"8021_CR52","doi-asserted-by":"publisher","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh M, Young P, Hockenmaier J (2013) Framing image description as a ranking task: data, models and evaluation metrics. J Artif Intell Res 47:853\u2013899","journal-title":"J Artif Intell Res"},{"issue":"8","key":"8021_CR53","doi-asserted-by":"publisher","first-page":"2554","DOI":"10.1073\/pnas.79.8.2554","volume":"79","author":"JJ Hopfield","year":"1982","unstructured":"Hopfield JJ (1982) Neural networks and physical systems with emergent collective computational abilities. Proc Natl Acad Sci 79(8):2554\u20132558","journal-title":"Proc Natl Acad Sci"},{"key":"8021_CR54","unstructured":"Huang EH, Socher R, Manning CD, Ng AY (2012) Improving word representations via global context and multiple word prototypes. In: Proceedings of the 50th annual meeting of the association for computational linguistics: long papers, vol 1. Association for Computational Linguistics, pp 873\u2013882"},{"issue":"3","key":"8021_CR55","doi-asserted-by":"publisher","first-page":"574","DOI":"10.1113\/jphysiol.1959.sp006308","volume":"148","author":"DH Hubel","year":"1959","unstructured":"Hubel DH, Wiesel TN (1959) Receptive fields of single neurones in the cat\u2019s striate cortex. J Physiol 148(3):574\u2013591","journal-title":"J Physiol"},{"issue":"8","key":"8021_CR56","doi-asserted-by":"publisher","first-page":"1944","DOI":"10.1109\/TPAMI.2012.268","volume":"35","author":"B Hutchinson","year":"2013","unstructured":"Hutchinson B, Deng L, Yu D (2013) Tensor deep stacking networks. IEEE Trans Pattern Anal Mach Intell 35(8):1944\u20131957","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"8021_CR57","unstructured":"Irsoy O, Cardie C (2014) Deep recursive neural networks for compositionality in language. In: Advances in neural information processing systems, pp 2096\u20132104"},{"key":"8021_CR58","doi-asserted-by":"crossref","unstructured":"Iyyer M, Manjunatha V, Boyd-Graber J, Daum\u00e9 H III (2015) Deep unordered composition rivals syntactic methods for text classification. In: Proceedings of the association for computational linguistics","DOI":"10.3115\/v1\/P15-1162"},{"issue":"5","key":"8021_CR59","doi-asserted-by":"publisher","first-page":"1063","DOI":"10.1109\/TNN.2004.832719","volume":"15","author":"EM Izhikevich","year":"2004","unstructured":"Izhikevich EM (2004) Which model to use for cortical spiking neurons? IEEE Trans Neural Netw 15(5):1063\u20131070","journal-title":"IEEE Trans Neural Netw"},{"key":"8021_CR60","doi-asserted-by":"crossref","unstructured":"Jia X et al (2015) Guiding the long-short term memory model for image caption generation. In: Proceedings of the IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2015.277"},{"key":"8021_CR61","doi-asserted-by":"crossref","unstructured":"Jiang W, Ma L, Chen X, Zhang H, Liu W (2018) Learning to guide decoding for image captioning. arXiv: 1804.00887","DOI":"10.1609\/aaai.v32i1.12283"},{"key":"8021_CR62","unstructured":"Jin J et al (2015) Aligning where to see and what to tell: image caption with region-based attention and scene factorization. arXiv: 1506.06272"},{"key":"8021_CR63","doi-asserted-by":"crossref","unstructured":"Karpathy A, Li F-F (2015) Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"8021_CR64","unstructured":"Karpathy A, Joulin A, Li FFF (2014) Deep fragment embeddings for bidirectional image sentence mapping. In: Advances in neural information processing systems"},{"key":"8021_CR65","doi-asserted-by":"crossref","unstructured":"Karpathy A, Toderici G, Shetty S, Leung T, Sukthankar R, Fei-Fei L (2014) Large-scale video classification with convolutional neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1725\u20131732","DOI":"10.1109\/CVPR.2014.223"},{"issue":"6","key":"8021_CR66","doi-asserted-by":"publisher","first-page":"398","DOI":"10.1049\/iet-cvi.2016.0286","volume":"11","author":"M Kilickaya","year":"2017","unstructured":"Kilickaya M, Akkus BK, Cakici R, Erdem A, Erdem E, Ikizler-Cinbis N (2017) Data-driven image captioning via salient region discovery. IET Comput Vis 11(6):398\u2013406","journal-title":"IET Comput Vis"},{"key":"8021_CR67","unstructured":"Kiros R, Salakhutdinov R, Zemel RS (2014) Unifying visual-semantic embeddings with multimodal neural language models. arXiv: 1411.2539"},{"key":"8021_CR68","unstructured":"Kiros R, Zemel R, Salakhutdinov R (2014) A multiplicative model for learning distributed text-based attribute representations. In: Advances in neural information processing systems"},{"key":"8021_CR69","doi-asserted-by":"crossref","unstructured":"Kohonen T (1995) Learning vector quantization. In: Self-organizing maps. Springer, Berlin, pp 175\u2013189","DOI":"10.1007\/978-3-642-97610-0_6"},{"issue":"1","key":"8021_CR70","doi-asserted-by":"publisher","first-page":"19","DOI":"10.1016\/S0925-2312(98)00031-9","volume":"21","author":"T Kohonen","year":"1998","unstructured":"Kohonen T, Somervuo P (1998) Self-organizing maps of symbol strings. Neurocomputing 21(1):19\u201330","journal-title":"Neurocomputing"},{"issue":"1","key":"8021_CR71","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R Krishna","year":"2017","unstructured":"Krishna R et al (2017) Visual genome: connecting language and vision using crowdsourced dense image annotations. Int J Comput Vis 123(1):32\u201373","journal-title":"Int J Comput Vis"},{"key":"8021_CR72","unstructured":"Krizhevsky A, Sutskever I, Hinton G E (2012) Imagenet classification with deep convolutional neural networks. In: Advances in neural information processing systems, pp 1097\u20131105"},{"issue":"12","key":"8021_CR73","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni G et al (2013) Babytalk: understanding and generating simple image descriptions. IEEE Trans Pattern Anal Mach Intell 35(12):2891\u20132903","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"8021_CR74","unstructured":"Kumar A, Irsoy O, Su J, Bradbury J, English R, Pierce B, ..., Socher R (2015) Ask me anything: dynamic memory networks for natural language processing. arXiv: 1506.07285"},{"issue":"10","key":"8021_CR75","doi-asserted-by":"crossref","first-page":"351","DOI":"10.1162\/tacl_a_00188","volume":"2","author":"P Kuznetsova","year":"2014","unstructured":"Kuznetsova P et al (2014) TREETALK: composition and compression of trees for image descriptions. TACL 2(10):351\u2013362","journal-title":"TACL"},{"issue":"2\u20133","key":"8021_CR76","doi-asserted-by":"publisher","first-page":"259","DOI":"10.1080\/01638539809545028","volume":"25","author":"TK Landauer","year":"1998","unstructured":"Landauer TK, Foltz PW, Laham D (1998) An introduction to latent semantic analysis. Discourse Process 25(2\u20133):259\u2013284","journal-title":"Discourse Process"},{"key":"8021_CR77","doi-asserted-by":"crossref","unstructured":"Lee H, Grosse R, Ranganath R, Ng AY (2009) Convolutional deep belief networks for scalable unsupervised learning of hierarchical representations. In: Proceedings of the 26th annual international conference on machine learning. ACM, pp 609\u2013616","DOI":"10.1145\/1553374.1553453"},{"key":"8021_CR78","doi-asserted-by":"crossref","unstructured":"Levy O, Goldberg Y (2014) Dependency-based word embeddings. In: ACL (2), pp 302\u2013308","DOI":"10.3115\/v1\/P14-2050"},{"key":"8021_CR79","unstructured":"Li S et al (2011) Composing simple image descriptions using web-scale n-grams. In: Proceedings of the fifteenth conference on computational natural language learning. Association for Computational Linguistics"},{"key":"8021_CR80","unstructured":"Li X, Wang X, Xu C, Lan W, Wei Q, Yang G, Xu J (2018) COCO-CN for cross-lingual image tagging, captioning and retrieval. arXiv: 1805.08661"},{"key":"8021_CR81","unstructured":"Lin Y, Tong Z, Zhu S, Yu K (2010) Deep coding network. In: Advances in neural information processing systems, pp 1405\u20131413"},{"key":"8021_CR82","doi-asserted-by":"crossref","unstructured":"Liu C, Mao J, Sha F, Yuille A L (2017) Attention correctness in neural image captioning. In: AAAI, pp 4176\u20134182","DOI":"10.1609\/aaai.v31i1.11197"},{"key":"8021_CR83","doi-asserted-by":"crossref","unstructured":"Liu C, Sun F, Wang C, Wang F, Yuille A (2017) MAT: a multimodal attentive translator for image captioning. arXiv: 1702.05658","DOI":"10.24963\/ijcai.2017\/563"},{"key":"8021_CR84","doi-asserted-by":"crossref","unstructured":"Liu S, Zhu Z, Ye N, Guadarrama S, Murphy K (2017) Improved image captioning via policy gradient optimization of spider. In: Proceedings IEEE international conference on computer vision, vol 3, p 3","DOI":"10.1109\/ICCV.2017.100"},{"key":"8021_CR85","doi-asserted-by":"crossref","unstructured":"Liu X, Li H, Shao J, Chen D, Wang X (2018) Show, tell and discriminate: image captioning by self-retrieval with partially labeled data. arXiv: 1803.08314","DOI":"10.1007\/978-3-030-01267-0_21"},{"issue":"7","key":"8021_CR86","doi-asserted-by":"publisher","first-page":"1201","DOI":"10.1016\/0893-6080(95)00061-5","volume":"8","author":"SCB Lo","year":"1995","unstructured":"Lo SCB, Chan HP, Lin JS, Li H, Freedman M T, Mun S K (1995) Artificial convolution neural network for medical image pattern recognition. Neural Netw 8(7):1201\u20131214","journal-title":"Neural Netw"},{"key":"8021_CR87","unstructured":"Lotter W, Kreiman G, Cox D (2016) Deep predictive coding networks for video prediction and unsupervised learning. arXiv: 1301.1880"},{"key":"8021_CR88","doi-asserted-by":"crossref","unstructured":"Lu J, Xiong C, Parikh D, Socher R (2017) Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR), vol 6, p 2","DOI":"10.1109\/CVPR.2017.345"},{"key":"8021_CR89","doi-asserted-by":"crossref","unstructured":"Lu D, Whitehead S, Huang L, Ji H, Chang S F (2018) Entity-aware image caption generation. arXiv: 1804.07889","DOI":"10.18653\/v1\/D18-1435"},{"key":"8021_CR90","doi-asserted-by":"crossref","unstructured":"Lu J, Yang J, Batra D, Parikh D (2018) Neural baby talk. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 7219\u20137228","DOI":"10.1109\/CVPR.2018.00754"},{"key":"8021_CR91","unstructured":"Luong T, Socher R, Manning CD (2013) Better word representations with recursive neural networks for morphology. In: CoNLL, pp 104\u2013113"},{"key":"8021_CR92","unstructured":"Mao J et al (2014) Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv: 1412.6632"},{"key":"8021_CR93","doi-asserted-by":"crossref","unstructured":"Mao J et al (2015) Learning like a child: fast novel visual concept learning from sentence descriptions of images. In: Proceedings of the IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2015.291"},{"key":"8021_CR94","doi-asserted-by":"crossref","unstructured":"Mathews AP, Xie L, He X (2016) SentiCap: generating image descriptions with sentiments. In: AAAI","DOI":"10.1609\/aaai.v30i1.10475"},{"key":"8021_CR95","unstructured":"Melnyk I, Sercu T, Dognin PL, Ross J, Mroueh Y (2018) Improved image captioning with adversarial semantic alignment. arXiv: 1805.00063"},{"key":"8021_CR96","doi-asserted-by":"crossref","unstructured":"Memisevic R, Hinton G (2007) Unsupervised learning of image transformations. In: IEEE conference on computer vision and pattern recognition, 2007. CVPR\u201907. IEEE","DOI":"10.1109\/CVPR.2007.383036"},{"key":"8021_CR97","doi-asserted-by":"crossref","unstructured":"Mikolov T, Karafi\u00e1t M, Burget L, Cernock\u00fd J, Khudanpur S (2010) Recurrent neural network based language model. In: Interspeech, vol 2, p 3","DOI":"10.21437\/Interspeech.2010-343"},{"key":"8021_CR98","unstructured":"Mitchell M et al (2012) Midge: generating image descriptions from computer vision detections. In: Proceedings of the 13th conference of the european chapter of the association for computational linguistics. Association for Computational Linguistics"},{"key":"8021_CR99","unstructured":"Mnih A, Hinton G E (2009) A scalable hierarchical distributed language model. In: Advances in neural information processing systems, pp 1081\u20131088"},{"key":"8021_CR100","unstructured":"Mnih A, Kavukcuoglu K (2013) Learning word embeddings efficiently with noise-contrastive estimation. In: Advances in neural information processing systems, pp 2265\u20132273"},{"key":"8021_CR101","unstructured":"Mnih V, Kavukcuoglu K, Silver D, Graves A, Antonoglou I, Wierstra D, Riedmiller M (2013) Playing atari with deep reinforcement learning. arXiv: 1312.5602"},{"issue":"7540","key":"8021_CR102","doi-asserted-by":"publisher","first-page":"529","DOI":"10.1038\/nature14236","volume":"518","author":"V Mnih","year":"2015","unstructured":"Mnih V, Kavukcuoglu K, Silver D, Rusu AA, Veness J, Bellemare MG, ..., Petersen S (2015) Human-level control through deep reinforcement learning. Nature 518(7540):529\u2013533","journal-title":"Nature"},{"key":"8021_CR103","doi-asserted-by":"crossref","unstructured":"Nakkiran P, Alvarez R, Prabhavalkar R, Parada C (2015) Compressing deep neural networks using a rank-constrained topology","DOI":"10.21437\/Interspeech.2015-351"},{"issue":"8","key":"8021_CR104","doi-asserted-by":"publisher","first-page":"2107","DOI":"10.1109\/TKDE.2015.2399298","volume":"27","author":"L Nie","year":"2015","unstructured":"Nie L, Wang M, Zhang L, Yan S, Zhang B, Chua T S (2015) Disease inference from health-related questions via sparse deep learning. IEEE Trans Knowl Data Eng 27(8):2107\u20132119","journal-title":"IEEE Trans Knowl Data Eng"},{"key":"8021_CR105","unstructured":"Ordonez V, Kulkarni G, Berg TL (2011) Im2text: describing images using 1 million captioned photographs. In: Advances in neural information processing systems"},{"key":"8021_CR106","doi-asserted-by":"crossref","unstructured":"Park CC, Kim B, Kim G (2017) Attend to you: personalized image captioning with context sequence memory networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 895\u2013903","DOI":"10.1109\/CVPR.2017.681"},{"key":"8021_CR107","unstructured":"Park CC, Kim B, Kim G (2018) Towards personalized image captioning via multimodal memory networks. IEEE Trans Pattern Anal Mach Intell 41.4(2018):999\u20131012"},{"key":"8021_CR108","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: global vectors for word representation. In: EMNLP, vol 14, pp 1532\u201343","DOI":"10.3115\/v1\/D14-1162"},{"key":"8021_CR109","doi-asserted-by":"crossref","unstructured":"Perozzi B, Al-Rfou R, Skiena S (2014) Deepwalk: online learning of social representations. In: Proceedings of the 20th ACM SIGKDD international conference on knowledge discovery and data mining. ACM, pp 701\u2013710","DOI":"10.1145\/2623330.2623732"},{"key":"8021_CR110","unstructured":"Pfister T, Simonyan K, Charles J, Zisserman A (2014) Deep convolutional neural networks for efficient pose estimation in gesture videos. In: Asian conference on computer vision. Springer International Publishing, pp 538\u2013552"},{"key":"8021_CR111","unstructured":"Pu Y et al (2016) Variational autoencoder for deep learning of images, labels and captions. In: Advances in neural information processing systems"},{"key":"8021_CR112","unstructured":"Ren S, He K, Girshick R, Sun J (2015) Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in neural information processing systems, pp 91\u201399"},{"key":"8021_CR113","doi-asserted-by":"crossref","unstructured":"Ren Z, Wang X, Zhang N, Lv X, Li LJ (2017) Deep reinforcement learning-based image captioning with embedding reward. arXiv: 1704.03899","DOI":"10.1109\/CVPR.2017.128"},{"key":"8021_CR114","doi-asserted-by":"crossref","unstructured":"Rennie SJ, Marcheret E, Mroueh Y, Ross J, Goel V (2017) Self-critical sequence training for image captioning. In: CVPR, vol 1, p 3","DOI":"10.1109\/CVPR.2017.131"},{"issue":"7","key":"8021_CR115","doi-asserted-by":"publisher","first-page":"969","DOI":"10.1016\/j.ijar.2008.11.006","volume":"50","author":"R Salakhutdinov","year":"2009","unstructured":"Salakhutdinov R, Hinton G (2009) Semantic hashing. Int J Approx Reason 50(7):969\u2013978","journal-title":"Int J Approx Reason"},{"key":"8021_CR116","unstructured":"Salakhutdinov R, Hinton GE (2009) Deep Boltzmann Machines. In: AISTATS, vol 1, p 3"},{"issue":"8","key":"8021_CR117","doi-asserted-by":"publisher","first-page":"1958","DOI":"10.1109\/TPAMI.2012.269","volume":"35","author":"R Salakhutdinov","year":"2013","unstructured":"Salakhutdinov R, Tenenbaum JB, Torralba A (2013) Learning with hierarchical-deep models. IEEE Trans Pattern Anal Mach Intell 35(8):1958\u20131971","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"issue":"2","key":"8021_CR118","doi-asserted-by":"publisher","first-page":"234","DOI":"10.1162\/neco.1992.4.2.234","volume":"4","author":"J Schmidhuber","year":"1992","unstructured":"Schmidhuber J (1992) Learning complex, extended sequences using the principle of history compression. Neural Comput 4(2):234\u2013242","journal-title":"Neural Comput"},{"key":"8021_CR119","doi-asserted-by":"crossref","unstructured":"Sharma P, Ding N, Goodman S, Soricut R (2018) Conceptual captions: a cleaned, hypernymed, image alt-text dataset for automatic image captioning. In: Proceedings of the 56th annual meeting of the association for computational linguistics (volume 1: long papers), vol 1, pp 2556\u20132565","DOI":"10.18653\/v1\/P18-1238"},{"issue":"4","key":"8021_CR120","doi-asserted-by":"publisher","first-page":"371","DOI":"10.1016\/S0098-1354(96)00281-5","volume":"21","author":"AM Shaw","year":"1997","unstructured":"Shaw AM, Doyle FJ, Schwaber JS (1997) A dynamic neural network approach to nonlinear process modeling. Comput Chem Eng 21(4):371\u2013385","journal-title":"Comput Chem Eng"},{"key":"8021_CR121","unstructured":"Simonyan K, Zisserman A (2014) Two-stream convolutional networks for action recognition in videos. In: Advances in neural information processing systems, pp 568\u2013576"},{"key":"8021_CR122","unstructured":"Simonyan K, Vedaldi A, Zisserman A (2013) Deep fisher networks for large-scale image classification. In: Advances in neural information processing systems, pp 163\u2013171"},{"key":"8021_CR123","unstructured":"Socher R, Lin CC, Manning C, Ng AY (2011) Parsing natural scenes and natural language with recursive neural networks. In: Proceedings of the 28th international conference on machine learning (ICML-11), pp 129\u2013136"},{"key":"8021_CR124","doi-asserted-by":"crossref","unstructured":"Socher R et al (2014) Grounded compositional semantics for finding and describing images with sentences, vol 2, pp 207\u2013218","DOI":"10.1162\/tacl_a_00177"},{"key":"8021_CR125","unstructured":"Srivastava N, Salakhutdinov R R (2012) Multimodal learning with deep Boltzmann machines. In: Advances in neural information processing systems, pp 2222\u20132230"},{"issue":"1","key":"8021_CR126","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava N, Hinton GE, Krizhevsky A, Sutskever I, Salakhutdinov R (2014) Dropout: a simple way to prevent neural networks from overfitting. J Mach Learn Res 15(1):1929\u20131958","journal-title":"J Mach Learn Res"},{"key":"8021_CR127","unstructured":"Strobl EV, Visweswaran S (2013) Deep multiple kernel learning. In: 2013 12th international conference on machine learning and applications (ICMLA), vol 1. IEEE, pp 414\u2013417"},{"key":"8021_CR128","unstructured":"Sukhbaatar S, Szlam A, Weston J, Fergus R End-To-End memory networks. NIPS 2015 (and arXiv: 1503.08895 )"},{"key":"8021_CR129","unstructured":"Sur C (2018) DeepSeq: learning browsing log data based personalized security vulnerabilities and counter intelligent measures. J Ambient Intell Humaniz Comput (2018):1\u201330"},{"key":"8021_CR130","doi-asserted-by":"crossref","unstructured":"Sur C (2018) Ensemble one-vs-all learning technique with emphatic & rehearsal training for phishing email classification using psychology. J Exp Theor Artif Intell 30.6(2018):733\u2013762","DOI":"10.1080\/0952813X.2018.1467496"},{"key":"8021_CR131","unstructured":"Sutskever I, Hinton GE, Taylor GW (2009) The recurrent temporal restricted boltzmann machine. In: Advances in neural information processing systems, pp 1601\u20131608"},{"key":"8021_CR132","unstructured":"Sutskever I, Martens J, Hinton G (2011) Generating text with recurrent neural networks. In: Proceedings of the 28th international conference on machine learning (ICML-11)"},{"key":"8021_CR133","unstructured":"Sutskever I, Vinyals O, Le QV (2014) Sequence to sequence learning with neural networks. In: Advances in neural information processing systems"},{"key":"8021_CR134","doi-asserted-by":"crossref","unstructured":"Szegedy C, Liu W, Jia Y, Sermanet P, Reed S, Anguelov D, ..., Rabinovich A (2015) Going deeper with convolutions. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 1\u20139","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"8021_CR135","doi-asserted-by":"crossref","unstructured":"Tang J, Qu M, Wang M, Zhang M, Yan J, Mei Q (2015) Line: large-scale information network embedding. In: Proceedings of the 24th international conference on World Wide Web. ACM, pp 1067\u20131077","DOI":"10.1145\/2736277.2741093"},{"key":"8021_CR136","doi-asserted-by":"crossref","unstructured":"Tavakoliy HR, Shetty R, Borji A, Laaksonen J (2017) Paying attention to descriptions generated by image captioning models. In: 2017 IEEE international conference on computer vision (ICCV). IEEE, pp 2506\u20132515","DOI":"10.1109\/ICCV.2017.272"},{"key":"8021_CR137","unstructured":"Torralba A, Tenenbaum JB, Salakhutdinov RR (2011) Learning to learn with compound hd models. In: Advances in neural information processing systems, pp 2061\u20132069"},{"key":"8021_CR138","doi-asserted-by":"crossref","unstructured":"Tran D et al (2015) Learning spatiotemporal features with 3d convolutional networks. In: Proceedings of the IEEE international conference on computer vision","DOI":"10.1109\/ICCV.2015.510"},{"key":"8021_CR139","doi-asserted-by":"crossref","unstructured":"Tran K et al (2016) Rich image captioning in the wild. In: Proceedings of the IEEE conference on computer vision and pattern recognition workshops","DOI":"10.1109\/CVPRW.2016.61"},{"key":"8021_CR140","unstructured":"Turian J, Ratinov L, Bengio Y (2010) Word representations: a simple and general method for semi-supervised learning. In: Proceedings of the 48th annual meeting of the association for computational linguistics. Association for Computational Linguistics, pp 384\u2013394"},{"key":"8021_CR141","unstructured":"Tymoshenko K, Bonadiman D, Moschitti A (2016) Convolutional neural networks vs. convolution kernels: feature engineering for answer sentence reranking. In: Proceedings of NAACL-HLT, pp 1268\u20131278"},{"key":"8021_CR142","first-page":"3371","volume":"11","author":"P Vincent","year":"2010","unstructured":"Vincent P, Larochelle H, Lajoie I, Bengio Y, Manzagol PA (2010) Stacked denoising autoencoders: learning useful representations in a deep network with a local denoising criterion. J Mach Learn Res 11:3371\u20133408","journal-title":"J Mach Learn Res"},{"key":"8021_CR143","doi-asserted-by":"crossref","unstructured":"Vinyals O et al (2015) Show and tell: a neural image caption generator. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"8021_CR144","unstructured":"Vinyals O, Fortunato M, Jaitly N (2015) Pointer networks. In: Advances in neural information processing systems, pp 2692\u20132700"},{"issue":"4","key":"8021_CR145","doi-asserted-by":"publisher","first-page":"652","DOI":"10.1109\/TPAMI.2016.2587640","volume":"39","author":"O Vinyals","year":"2017","unstructured":"Vinyals O, Toshev A, Bengio S, Erhan D (2017) Show and tell: lessons learned from the 2015 mscoco image captioning challenge. IEEE Trans Pattern Anal Mach Intell 39(4):652\u2013663","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"8021_CR146","unstructured":"Wang Z, de Freitas N, Lanctot M (2015) Dueling network architectures for deep reinforcement learning. arXiv: 1511.06581"},{"key":"8021_CR147","doi-asserted-by":"crossref","unstructured":"Wang Y, Lin Z, Shen X, Cohen S, Cottrell G W (2017) Skeleton key: image captioning by skeleton-attribute decomposition. arXiv: 1704.06972","DOI":"10.1109\/CVPR.2017.780"},{"issue":"2s","key":"8021_CR148","first-page":"40","volume":"14","author":"C Wang","year":"2018","unstructured":"Wang C, Yang H, Meinel C (2018) Image captioning with deep bidirectional LSTMs and multi-task learning. ACM Trans Multimed Comput Commun Appl (TOMM) 14(2s):40","journal-title":"ACM Trans Multimed Comput Commun Appl (TOMM)"},{"key":"8021_CR149","unstructured":"Weston J, Chopra S, Bordes A Memory networks. ICLR 2015 (and arXiv: 1410.3916 )"},{"key":"8021_CR150","unstructured":"Weston J, Bordes A, Chopra S, Rush AM, van Merri\u00ebnboer B, Joulin A, Mikolov T (2015) Towards ai-complete question answering: a set of prerequisite toy tasks. arXiv: 1502.05698"},{"key":"8021_CR151","doi-asserted-by":"crossref","unstructured":"Wu Q et al (2016) What value do explicit high level concepts have in vision to language problems?. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2016.29"},{"key":"8021_CR152","unstructured":"Wu Q, Shen C, Wang P, Dick A, van den Hengel A (2017) Image captioning and visual question answering based on attributes and external knowledge. IEEE Trans Pattern Anal Mach Intell 40.6(2017):1367\u20131381"},{"key":"8021_CR153","unstructured":"Wu J, Hu Z, Mooney RJ (2018) Joint image captioning and question answering. arXiv: 1805.08389"},{"key":"8021_CR154","doi-asserted-by":"crossref","unstructured":"Wu C, Wei Y, Chu X, Su F, Wang L (2018) Modeling visual and word-conditional semantic attention for image captioning. Signal Process Image Commun 67(2018):100\u2013107","DOI":"10.1016\/j.image.2018.06.002"},{"key":"8021_CR155","unstructured":"Xu K et al (2015) Show, attend and tell: neural image caption generation with visual attention. In: International conference on machine learning"},{"key":"8021_CR156","unstructured":"Yang Y et al (2011) Corpus-guided sentence generation of natural images. In: Proceedings of the conference on empirical methods in natural language processing. Association for Computational Linguistics"},{"key":"8021_CR157","unstructured":"Yang Z et al (2016) Review networks for caption generation. In: Advances in neural information processing systems"},{"key":"8021_CR158","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Mei T (2017) Incorporating copying mechanism in image captioning for learning novel objects. In: 2017 IEEE conference on computer vision and pattern recognition (CVPR). IEEE, pp 5263\u20135271","DOI":"10.1109\/CVPR.2017.559"},{"key":"8021_CR159","doi-asserted-by":"crossref","unstructured":"Yao T, Pan Y, Li Y, Qiu Z, Mei T (2017) Boosting image captioning with attributes. In: IEEE international conference on computer vision, ICCV, pp 22\u201329","DOI":"10.1109\/ICCV.2017.524"},{"key":"8021_CR160","doi-asserted-by":"crossref","unstructured":"Ye S, Liu N, Han J (2018) Attentive linear transformation for image captioning. IEEE Trans Image Process 27.11(2018):5514\u20135524","DOI":"10.1109\/TIP.2018.2855406"},{"key":"8021_CR161","doi-asserted-by":"crossref","unstructured":"You Q et al (2016) Image captioning with semantic attention. In: Proceedings of the IEEE conference on computer vision and pattern recognition","DOI":"10.1109\/CVPR.2016.503"},{"key":"8021_CR162","unstructured":"You Q, Jin H, Luo J (2018) Image captioning at will: a versatile scheme for effectively injecting sentiments into image descriptions. arXiv: 1801.10121"},{"key":"8021_CR163","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young P et al (2014) From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics 2:67\u201378","journal-title":"Transactions of the Association for Computational Linguistics"},{"issue":"2","key":"8021_CR164","doi-asserted-by":"publisher","first-page":"388","DOI":"10.1109\/TASL.2012.2227738","volume":"21","author":"D Yu","year":"2013","unstructured":"Yu D, Deng L, Seide F (2013) The deep tensor neural network with applications to large vocabulary speech recognition. IEEE Trans Audio Speech Lang Process 21 (2):388\u2013396","journal-title":"IEEE Trans Audio Speech Lang Process"},{"key":"8021_CR165","unstructured":"Zhang X, LeCun Y (2015) Text understanding from scratch. arXiv: 1502.01710"},{"key":"8021_CR166","unstructured":"Zhang X, Zhao J, LeCun Y (2015) Character-level convolutional networks for text classification. In: Advances in neural information processing systems, pp 649\u2013657"},{"key":"8021_CR167","unstructured":"Zhang L, Sung F, Liu F, Xiang T, Gong S, Yang Y, Hospedales T M (2017) Actor-critic sequence training for image captioning. arXiv: 1706.09601"},{"key":"8021_CR168","unstructured":"Zhang Y-D et al (2017) Image based fruit category classification by 13-layer deep convolutional neural network and data augmentation. Multimed Tools Appl 78.3(2019):3613\u20133632"},{"issue":"17","key":"8021_CR169","doi-asserted-by":"publisher","first-page":"22821","DOI":"10.1007\/s11042-018-5765-3","volume":"77","author":"Y-D Zhang","year":"2018","unstructured":"Zhang Y-D, Muhammad K, Tang C (2018) Twelve-layer deep convolutional neural network with stochastic pooling for tea category classification on GPU platform. Multimed Tools Appl 77(17):22821\u201322839","journal-title":"Multimed Tools Appl"},{"key":"8021_CR170","unstructured":"Zhang M, Yang Y, Zhang H, Ji Y, Shen H T, Chua T S (2018) More is better: precise and detailed image captioning using online positive recall and missing concepts mining. IEEE Trans Image Process 28.1(2018):32\u201344"},{"key":"8021_CR171","doi-asserted-by":"crossref","unstructured":"Zhao W, Wang B, Ye J, Yang M, Zhao Z, Luo R, Qiao Y (2018) A multi-task learning approach for image captioning. In: IJCAI, pp 1205\u20131211","DOI":"10.24963\/ijcai.2018\/168"},{"key":"8021_CR172","unstructured":"Zhuang J, Tsang IW, Hoi SC (2011) Two-layer multiple kernel learning. In: AISTATS, pp 909\u2013917"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-019-08021-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11042-019-08021-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-019-08021-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,9,18]],"date-time":"2023-09-18T18:31:14Z","timestamp":1695061874000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11042-019-08021-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019,7,31]]},"references-count":172,"journal-issue":{"issue":"22","published-print":{"date-parts":[[2019,11]]}},"alternative-id":["8021"],"URL":"https:\/\/doi.org\/10.1007\/s11042-019-08021-1","relation":{},"ISSN":["1380-7501","1573-7721"],"issn-type":[{"value":"1380-7501","type":"print"},{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2019,7,31]]},"assertion":[{"value":"14 August 2018","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"18 May 2019","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 July 2019","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"31 July 2019","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}}]}}