{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,16]],"date-time":"2026-04-16T16:48:58Z","timestamp":1776358138268,"version":"3.51.2"},"reference-count":60,"publisher":"IEEE","license":[{"start":{"date-parts":[[2015,6,1]],"date-time":"2015-06-01T00:00:00Z","timestamp":1433116800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-029"},{"start":{"date-parts":[[2015,6,1]],"date-time":"2015-06-01T00:00:00Z","timestamp":1433116800000},"content-version":"stm-asf","delay-in-days":0,"URL":"https:\/\/doi.org\/10.15223\/policy-037"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015,6]]},"DOI":"10.1109\/cvpr.2015.7298932","type":"proceedings-article","created":{"date-parts":[[2015,10,15]],"date-time":"2015-10-15T18:42:06Z","timestamp":1444934526000},"page":"3128-3137","source":"Crossref","is-referenced-by-count":2998,"title":["Deep visual-semantic alignments for generating image descriptions"],"prefix":"10.1109","author":[{"given":"Andrej","family":"Karpathy","sequence":"first","affiliation":[{"name":"Department of Computer Science, Stanford University, USA"}]},{"given":"Li","family":"Fei-Fei","sequence":"additional","affiliation":[{"name":"Department of Computer Science, Stanford University, USA"}]}],"member":"263","reference":[{"key":"ref39","article-title":"A Joint Model of Language and Perception for Grounded Attribute Learning","author":"matuszek","year":"2012","journal-title":"Proceedings of the International Conference on Machine Learning 2012"},{"key":"ref38","article-title":"Explain images with multimodal recurrent neural networks","author":"mao","year":"2014","journal-title":"ar Xiv preprint arXiv 1410 1090"},{"key":"ref33","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2007.4408872"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/5.726791"},{"key":"ref31","doi-asserted-by":"crossref","first-page":"351","DOI":"10.1162\/tacl_a_00188","article-title":"Treetalk: Composition and compression of trees for image descriptions","volume":"2","author":"kuznetsova","year":"2014","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"ref30","article-title":"Collective generation of natural image descriptions","author":"kuznetsova","year":"2012","journal-title":"ACL"},{"key":"ref37","article-title":"Microsoft coco: Common objects in context","author":"lin","year":"2014","journal-title":"arXiv preprint arXiv 1405 0312"},{"key":"ref36","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.340"},{"key":"ref35","article-title":"Composing simple image descriptions using web-scale n-grams","author":"li","year":"2011","journal-title":"CoNLL"},{"key":"ref34","doi-asserted-by":"crossref","first-page":"2036","DOI":"10.1109\/CVPR.2009.5206718","article-title":"Towards total scene understanding: Classification, annotation and segmentation in an automatic framework","author":"li","year":"2009","journal-title":"Computer Vision and Pattern Recognition 2009 CVPR 2009 IEEE Conference on"},{"key":"ref60","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.211"},{"key":"ref28","article-title":"Imagenet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"NIPS"},{"key":"ref27","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.455"},{"key":"ref29","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"ref2","article-title":"Matching words and pictures","author":"barnard","year":"2003","journal-title":"JMLR"},{"key":"ref1","article-title":"Video in sentences out","author":"barbu","year":"2012","journal-title":"arXiv preprint arXiv 1204 2742"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref22","article-title":"Glove: Global vectors for word representation","author":"jeffreypennington","year":"2014"},{"key":"ref21","doi-asserted-by":"crossref","DOI":"10.1613\/jair.3994","article-title":"Framing image description as a ranking task: data, models and evaluation metrics","author":"hodosh","year":"2013","journal-title":"Journal of Artificial Intelligence Research"},{"key":"ref24","article-title":"Deep fragment em-beddings for bidirectional image sentence mapping","author":"karpathy","year":"2014","journal-title":"arXiv preprint arXiv 1406 5679"},{"key":"ref23","article-title":"Learning cross-modality similarity for multinomial data","author":"jia","year":"2011","journal-title":"ICCV"},{"key":"ref26","article-title":"Multimodal neural language models","author":"kiros","year":"2014","journal-title":"ICML"},{"key":"ref25","article-title":"Unifying visual-semantic embeddings with multimodal neural language models","author":"kiros","year":"2014","journal-title":"arXiv preprint arXiv 1411 2539"},{"key":"ref50","article-title":"Generating text with recurrent neural networks","author":"sutskever","year":"2011","journal-title":"ICML"},{"key":"ref51","article-title":"Going deeper with convolutions","author":"szegedy","year":"2014","journal-title":"arXiv preprint arXiv 1409 4842"},{"key":"ref59","article-title":"Recurrent neural network regularization","author":"zaremba","year":"2014","journal-title":"arXiv preprint arXiv 1409"},{"key":"ref58","doi-asserted-by":"crossref","DOI":"10.1162\/tacl_a_00166","article-title":"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions","author":"young","year":"2014","journal-title":"TACL"},{"key":"ref57","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/S14-1015"},{"key":"ref56","doi-asserted-by":"publisher","DOI":"10.1109\/JPROC.2010.2050411"},{"key":"ref55","article-title":"Corpus-guided sentence generation of natural images","author":"yang","year":"2011","journal-title":"EMNLP"},{"key":"ref54","article-title":"Show and tell: A neural image caption generator","author":"vinyals","year":"2014","journal-title":"arXiv preprint arXiv 1411 4555"},{"key":"ref53","article-title":"Cider: Consensus-based image description evaluation","author":"vedantam","year":"2014","journal-title":"CoRR abs\/1411 5726"},{"key":"ref52","article-title":"Lecture 6.5-rmsprop: Divide the gradient by a running average of its recent magnitude","author":"tieleman","year":"2012"},{"key":"ref10","doi-asserted-by":"publisher","DOI":"10.1016\/0364-0213(90)90002-E"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-009-0275-4"},{"key":"ref40","doi-asserted-by":"crossref","DOI":"10.21437\/Interspeech.2010-343","article-title":"Recurrent neural network based language model","author":"mikolov","year":"2010","journal-title":"InterSpeech"},{"key":"ref12","article-title":"From captions to visual concepts and back","author":"fang","year":"2014","journal-title":"arXiv preprint arXiv 1411 4952"},{"key":"ref13","article-title":"Every picture tells a story: Generating sentences from images","author":"farhadi","year":"2010","journal-title":"ECCV"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1167\/7.1.10"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.260"},{"key":"ref16","article-title":"Devise: A deep visual-semantic embedding model","author":"frome","year":"2013","journal-title":"NIPS"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.81"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2009.5459211"},{"key":"ref19","doi-asserted-by":"crossref","DOI":"10.1007\/978-3-642-34500-5_24","article-title":"From image annotation to image description","author":"gupta","year":"2012","journal-title":"Neural Information Processing"},{"key":"ref4","article-title":"Microsoft coco captions: Data collection and evaluation server","author":"chen","year":"2015","journal-title":"arXiv preprint arXiv 1504 00325"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1007\/3-540-33486-6_6"},{"key":"ref6","article-title":"Imagenet: A large-scale hierarchical image database","author":"deng","year":"2009","journal-title":"CVPR"},{"key":"ref5","article-title":"Learning a recurrent visual representation for image caption generation","author":"chen","year":"2014","journal-title":"CoRR abs\/1411 5654"},{"key":"ref8","article-title":"Long-term recurrent convolutional networks for visual recognition and description","author":"donahue","year":"2014","journal-title":"arXiv preprint arXiv 1411 4389"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/W14-3348"},{"key":"ref49","doi-asserted-by":"crossref","DOI":"10.1162\/tacl_a_00177","article-title":"Grounded compositional semantics for finding and describing images with sentences","author":"socher","year":"2014","journal-title":"TACL"},{"key":"ref9","first-page":"1292","article-title":"Image description using visual dependency representations","author":"elliott","year":"2013","journal-title":"EMNLP"},{"key":"ref46","doi-asserted-by":"publisher","DOI":"10.1109\/78.650093"},{"key":"ref45","article-title":"Imagenet large scale visual recognition challenge","author":"russakovsky","year":"2014"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2010.5540112"},{"key":"ref47","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2014","journal-title":"arXiv preprint arXiv 1409 1556"},{"key":"ref42","article-title":"III. Midge: Generating image descriptions from computer vision detections","author":"mitchell","year":"2012","journal-title":"EACL"},{"key":"ref41","article-title":"Distributed representations of words and phrases and their compositionality","author":"mikolov","year":"2013","journal-title":"NIPS"},{"key":"ref44","first-page":"311","article-title":"Bleu: a method for automatic evaluation of machine translation","author":"papineni","year":"2002","journal-title":"Proceedings of the 40th Annual Meeting on Association for Computational Linguistics - ACL '02"},{"key":"ref43","article-title":"Im2text: Describing images using 1 million captioned photographs","author":"ordonez","year":"2011","journal-title":"NIPS"}],"event":{"name":"2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Boston, MA, USA","start":{"date-parts":[[2015,6,7]]},"end":{"date-parts":[[2015,6,12]]}},"container-title":["2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7293313\/7298593\/07298932.pdf?arnumber=7298932","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,8,15]],"date-time":"2025-08-15T18:14:45Z","timestamp":1755281685000},"score":1,"resource":{"primary":{"URL":"https:\/\/ieeexplore.ieee.org\/document\/7298932\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015,6]]},"references-count":60,"URL":"https:\/\/doi.org\/10.1109\/cvpr.2015.7298932","relation":{},"subject":[],"published":{"date-parts":[[2015,6]]}}}