{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,27]],"date-time":"2026-02-27T05:09:25Z","timestamp":1772168965365,"version":"3.50.1"},"reference-count":53,"publisher":"IEEE","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016,6]]},"DOI":"10.1109\/cvpr.2016.13","type":"proceedings-article","created":{"date-parts":[[2016,12,12]],"date-time":"2016-12-12T20:38:49Z","timestamp":1481575129000},"page":"49-58","source":"Crossref","is-referenced-by-count":569,"title":["Learning Deep Representations of Fine-Grained Visual Descriptions"],"prefix":"10.1109","author":[{"given":"Scott","family":"Reed","sequence":"first","affiliation":[]},{"given":"Zeynep","family":"Akata","sequence":"additional","affiliation":[]},{"given":"Honglak","family":"Lee","sequence":"additional","affiliation":[]},{"given":"Bernt","family":"Schiele","sequence":"additional","affiliation":[]}],"member":"263","reference":[{"key":"ref39","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-015-0816-y"},{"key":"ref38","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995627"},{"key":"ref33","author":"norouzi","year":"2013","journal-title":"Zero-shot learning by convex combination of semantic embeddings"},{"key":"ref32","doi-asserted-by":"publisher","DOI":"10.1109\/ICVGIP.2008.47"},{"key":"ref31","article-title":"Multimodal deep learning","author":"ngiam","year":"2011","journal-title":"ICML"},{"key":"ref30","article-title":"Beyond short snippets: Deep networks for video classification","author":"ng","year":"2015","journal-title":"CVPR"},{"key":"ref37","doi-asserted-by":"publisher","DOI":"10.3115\/v1\/D14-1162"},{"key":"ref36","article-title":"Zero-shot learning with semantic output codes","author":"palatucci","year":"2009","journal-title":"NIPS"},{"key":"ref35","article-title":"Im2Text: Describing images using 1 million captioned photographs","author":"ordonez","year":"2011","journal-title":"NIPS"},{"key":"ref34","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.222"},{"key":"ref28","article-title":"Distributed representations of words and phrases and their compositionality","author":"mikolov","year":"2013","journal-title":"NIPS"},{"key":"ref27","author":"metz","year":"2015","journal-title":"Facebooks ai can caption photos for the blind on its own"},{"key":"ref29","first-page":"39","volume":"38","author":"miller","year":"1995","journal-title":"Wordnet A Lexical Database for English"},{"key":"ref2","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298911"},{"key":"ref1","article-title":"Label-embedding for image classification","author":"akata","year":"2015","journal-title":"IEEE TPAMI"},{"key":"ref20","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"ref22","article-title":"ImageNet classification with deep convolutional neural networks","author":"krizhevsky","year":"2012","journal-title":"NIPS"},{"key":"ref21","article-title":"Ranking and retrieval of image sequences from multiple paragraph queries","author":"kim","year":"2015","journal-title":"CVPR"},{"key":"ref24","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2013.140"},{"key":"ref23","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"ref26","article-title":"Deep captioning with multimodal recurrent neural networks (M-RNN)","author":"mao","year":"2015","journal-title":"ICLRE"},{"key":"ref25","article-title":"Microsoft COCO: Common objects in context","author":"lin","year":"2014","journal-title":"ECCV"},{"key":"ref50","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","article-title":"From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions","volume":"2","author":"young","year":"2014","journal-title":"TACL"},{"key":"ref51","article-title":"Part-based R-CNNs for fine-grained category detection","author":"zhang","year":"2014","journal-title":"ECCV"},{"key":"ref53","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.179"},{"key":"ref52","article-title":"Character-level convolutional networks for text classification","author":"zhang","year":"2015","journal-title":"NIPS"},{"key":"ref10","article-title":"Discovering localized attributes for fine-grained recognition","author":"duan","year":"2012","journal-title":"CVPR"},{"key":"ref11","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2013.321"},{"key":"ref40","article-title":"Very deep convolutional networks for large-scale image recognition","author":"simonyan","year":"2015","journal-title":"ICLRE"},{"key":"ref12","article-title":"Devise: A deep visual-semantic embedding model","author":"frome","year":"2013","journal-title":"NIPS"},{"key":"ref13","article-title":"Transductive multi-view embedding for zero-shot recognition and annotation","author":"fu","year":"2014","journal-title":"ECCV"},{"key":"ref14","doi-asserted-by":"publisher","DOI":"10.1109\/TPAMI.2015.2408354"},{"key":"ref15","doi-asserted-by":"publisher","DOI":"10.1145\/2578726.2578746"},{"key":"ref16","doi-asserted-by":"publisher","DOI":"10.1080\/00437956.1954.11659520"},{"key":"ref17","doi-asserted-by":"publisher","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"ref18","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298638"},{"key":"ref19","article-title":"Batch normalization: Accelerating deep network training by reducing internal covariate shift","author":"ioffe","year":"2015","journal-title":"ICML"},{"key":"ref4","article-title":"Label embedding trees for large multi-class tasks","author":"bengio","year":"2010","journal-title":"NIPS"},{"key":"ref3","doi-asserted-by":"publisher","DOI":"10.1109\/ICCV.2015.483"},{"key":"ref6","article-title":"ImageNet: A large-scale hierarchical image database","author":"deng","year":"2009","journal-title":"CVPR"},{"key":"ref5","article-title":"Zero-shot video retrieval using content and concepts","author":"dalton","year":"2013","journal-title":"CIKM"},{"key":"ref8","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"ref7","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2013.81"},{"key":"ref49","article-title":"Show, attend and tell: Neural image caption generation with visual attention","author":"xu","year":"2015","journal-title":"ICML"},{"key":"ref9","article-title":"Decaf: A deep convolutional activation feature for generic visual recognition","author":"donahue","year":"2014","journal-title":"ICML"},{"key":"ref46","article-title":"Caltech-Ucds Birds 200","author":"welinder","year":"2010","journal-title":"Technical Report CNS-TR-2010-001"},{"key":"ref45","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"ref48","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2014.341"},{"key":"ref47","doi-asserted-by":"publisher","DOI":"10.1007\/s10994-010-5198-3"},{"key":"ref42","article-title":"Improved multimodal deep learning with variation of information","author":"sohn","year":"2014","journal-title":"NIPS"},{"key":"ref41","article-title":"Zero-shot learning through cross-modal transfer","author":"socher","year":"2013","journal-title":"NIPS"},{"key":"ref44","doi-asserted-by":"publisher","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"ref43","first-page":"2949","volume":"15","author":"srivastava","year":"2014","journal-title":"Multimodal learning with deep boltzmann machines"}],"event":{"name":"2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)","location":"Las Vegas, NV, USA","start":{"date-parts":[[2016,6,27]]},"end":{"date-parts":[[2016,6,30]]}},"container-title":["2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR)"],"original-title":[],"link":[{"URL":"http:\/\/xplorestaging.ieee.org\/ielx7\/7776647\/7780329\/07780382.pdf?arnumber=7780382","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,16]],"date-time":"2022-07-16T20:05:52Z","timestamp":1658001952000},"score":1,"resource":{"primary":{"URL":"http:\/\/ieeexplore.ieee.org\/document\/7780382\/"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016,6]]},"references-count":53,"URL":"https:\/\/doi.org\/10.1109\/cvpr.2016.13","relation":{},"subject":[],"published":{"date-parts":[[2016,6]]}}}