{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,21]],"date-time":"2025-10-21T15:27:16Z","timestamp":1761060436779,"version":"3.37.3"},"reference-count":47,"publisher":"Springer Science and Business Media LLC","issue":"2-3","license":[{"start":{"date-parts":[[2017,10,14]],"date-time":"2017-10-14T00:00:00Z","timestamp":1507939200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2017,10,14]],"date-time":"2017-10-14T00:00:00Z","timestamp":1507939200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Inf Retrieval J"],"published-print":{"date-parts":[[2018,6]]},"DOI":"10.1007\/s10791-017-9318-6","type":"journal-article","created":{"date-parts":[[2017,10,14]],"date-time":"2017-10-14T14:56:57Z","timestamp":1507993017000},"page":"208-229","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":17,"title":["Picture it in your mind: generating high level visual representations from textual descriptions"],"prefix":"10.1007","volume":"21","author":[{"given":"Fabio","family":"Carrara","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0002-5725-4322","authenticated-orcid":false,"given":"Andrea","family":"Esuli","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tiziano","family":"Fagni","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fabrizio","family":"Falchi","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Alejandro","family":"Moreo Fern\u00e1ndez","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,10,14]]},"reference":[{"key":"9318_CR1","doi-asserted-by":"crossref","unstructured":"Bai, Y., Yu, W., Xiao, T., Xu, C., Yang, K., Ma, W.-Y., & Zhao, T. (2014). Bag-of-words based deep neural network for image retrieval. In Proceedings of the ACM international conference on multimedia (pp. 229\u2013232). ACM.","DOI":"10.1145\/2647868.2656402"},{"key":"9318_CR2","doi-asserted-by":"crossref","unstructured":"Cappallo, S., Mensink, T., & Snoek, C. G. (2015). Image2emoji: Zero-shot emoji prediction for visual media. In Proceedings of the 23rd ACM international conference on multimedia, MM \u201915 (pp. 1311\u20131314). New York, NY: ACM.","DOI":"10.1145\/2733373.2806335"},{"key":"9318_CR3","unstructured":"Chen, X., Fang, H., Lin, T.-Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., & Zitnick, C. L. (2015). Microsoft coco captions: Data collection and evaluation server. arXiv preprint arXiv:1504.00325."},{"key":"9318_CR4","doi-asserted-by":"crossref","unstructured":"Cheng, H.-T., Koc, L., Harmsen, J., Shaked, T., Chandra, T., Aradhye, H., Anderson, G., Corrado, G., Chai, W., Ispir, M., et\u00a0al. (2016). Wide & deep learning for recommender systems. In Proceedings of the 1st workshop on deep learning for recommender systems (pp. 7\u201310). ACM.","DOI":"10.1145\/2988450.2988454"},{"key":"9318_CR5","doi-asserted-by":"crossref","unstructured":"Cho, K., Van\u00a0Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., & Bengio, Y. (2014). Learning phrase representations using RNN encoder\u2013decoder for statistical machine translation. arXiv preprint arXiv:1406.1078.","DOI":"10.3115\/v1\/D14-1179"},{"issue":"3","key":"9318_CR6","doi-asserted-by":"publisher","first-page":"287","DOI":"10.1016\/0165-1684(94)90029-9","volume":"36","author":"P Comon","year":"1994","unstructured":"Comon, P. (1994). Independent component analysis, a new concept? Signal Processing, 36(3), 287\u2013314.","journal-title":"Signal Processing"},{"issue":"3","key":"9318_CR7","doi-asserted-by":"publisher","first-page":"521","DOI":"10.1109\/TPAMI.2013.142","volume":"36","author":"J Costa Pereira","year":"2014","unstructured":"Costa Pereira, J., Coviello, E., Doyle, G., Rasiwasia, N., Lanckriet, G. R., Levy, R., et al. (2014). On the role of correlation and abstraction in cross-modal multimedia retrieval. IEEE Transactions on Pattern Analysis and Machine Intelligence, 36(3), 521\u2013535.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"9318_CR8","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne\u00a0Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., & Darrell, T. (2015). Long-term recurrent convolutional networks for visual recognition and description. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2625\u20132634).","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"9318_CR9","unstructured":"Donahue, J., Jia, Y., Vinyals, O., Hoffman, J., Zhang, N., Tzeng, E., & Darrell, T. (2013). Decaf: A deep convolutional activation feature for generic visual recognition. arXiv preprint arXiv:1310.1531"},{"key":"9318_CR10","unstructured":"Dong, J., Li, X., & Snoek, C.\u00a0G.\u00a0M. (2016). Word2VisualVec: Cross-media retrieval by visual feature prediction. arXiv preprint arXiv:1604.06838"},{"key":"9318_CR11","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F., Srivastava, R. K., Deng, L., Doll\u00e1r, P., Gao, J., He, X., Mitchell, M., Platt, J. C., et\u00a0al. (2015). From captions to visual concepts and back. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1473\u20131482).","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"9318_CR12","doi-asserted-by":"crossref","unstructured":"Feng, F., Wang, X., & Li, R. (2014). Cross-modal retrieval with correspondence autoencoder. In Proceedings of the ACM international conference on multimedia (pp. 7\u201316). ACM.","DOI":"10.1145\/2647868.2654902"},{"key":"9318_CR13","unstructured":"Frome, A., Corrado, G.S., Shlens, J., Bengio, S., Dean, J., Ranzato, M., et\u00a0al. (2013). Devise: A deep visual-semantic embedding model. In C. J. C. Burges, L. Bottou, M. Welling, Z. Ghahramani & K. Q. Weinberger (Eds.), Proceedings of the 26th International Conference on Neural Information Processing Systems (NIPS\u201913), (pp. 2121\u20132129). USA: Curran Associates Inc."},{"key":"9318_CR14","doi-asserted-by":"crossref","unstructured":"Gong, Y., Wang, L., Guo, R., & Lazebnik, S. (2014). Multi-scale orderless pooling of deep convolutional activation features. In European conference on computer vision (pp. 392\u2013407). Springer.","DOI":"10.1007\/978-3-319-10584-0_26"},{"key":"9318_CR15","doi-asserted-by":"crossref","unstructured":"Gong, Y., Wang, L., Hodosh, M., Hockenmaier, J., & Lazebnik, S. (2014). Improving image-sentence embeddings using large weakly annotated photo collections. In Computer vision\u2013ECCV 2014 (pp. 529\u2013545). Springer.","DOI":"10.1007\/978-3-319-10593-2_35"},{"key":"9318_CR16","first-page":"241","volume-title":"Deep image retrieval: Learning global representations for image search","author":"A Gordo","year":"2016","unstructured":"Gordo, A., Almaz\u00e1n, J., Revaud, J., & Larlus, D. (2016). Deep image retrieval: Learning global representations for image search (pp. 241\u2013257). Cham: Springer International Publishing."},{"key":"9318_CR17","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2015). Deep residual learning for image recognition. arXiv preprint arXiv:1512.03385.","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"9318_CR18","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"9318_CR19","doi-asserted-by":"crossref","unstructured":"Hua, X.-S., Yang, L., Wang, J., Wang, J., Ye, M., Wang, K., Rui, Y., & Li, J. (2013). Clickage: Towards bridging semantic and intent gaps via mining click logs of search engines. In Proceedings of the 21st ACM international conference on multimedia (pp. 243\u2013252). ACM.","DOI":"10.1145\/2502081.2502283"},{"issue":"4","key":"9318_CR20","doi-asserted-by":"publisher","first-page":"422","DOI":"10.1145\/582415.582418","volume":"20","author":"K J\u00e4rvelin","year":"2002","unstructured":"J\u00e4rvelin, K., & Kek\u00e4l\u00e4inen, J. (2002). Cumulated gain-based evaluation of IR techniques. ACM Transactions on Information Systems (TOIS), 20(4), 422\u2013446.","journal-title":"ACM Transactions on Information Systems (TOIS)"},{"key":"9318_CR21","doi-asserted-by":"crossref","unstructured":"J\u00e9gou, H., & Chum, O. (2012). Negative evidences and co-occurences in image retrieval: The benefit of PCA and whitening. In Computer vision\u2014ECCV 2012 (pp. 774\u2013787). Springer.","DOI":"10.1007\/978-3-642-33709-3_55"},{"key":"9318_CR22","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3128\u20133137).","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"9318_CR23","unstructured":"Kingma, D., & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv preprint arXiv:1412.6980."},{"key":"9318_CR24","unstructured":"Kiros, R., Salakhutdinov, R., & Zemel, R. S. (2014). Unifying visual-semantic embeddings with multimodal neural language models. arXiv preprint arXiv:1411.2539."},{"key":"9318_CR25","unstructured":"Kiros, R., Zhu, Y., Salakhutdinov, R., Zemel, R. S., Torralba, A., Urtasun, R., & Fidler, S. (2015). Skip-thought vectors. In C. Cortes, D. D. Lee, M. Sugiyama & R. Garnett (Eds.), Proceedings of the 28th International Conference on Neural Information Processing Systems (NIPS\u201915), (pp. 3294\u20133302). Cambridge, MA, USA: MIT Press."},{"key":"9318_CR26","unstructured":"Klein, B., Lev, G., Sadeh, G., & Wolf, L. (2014). Fisher vectors derived from hybrid Gaussian\u2013Laplacian mixture models for image annotation. arXiv preprint arXiv:1411.7399."},{"key":"9318_CR27","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G.\u00a0E. (2012). Imagenet classification with deep convolutional neural networks. In F. Pereira, C. J. C. Burges, L. Bottou & K. Q. Weinberger (Eds.), Proceedings of the 25th International Conference on Neural Information Processing Systems (NIPS\u201912), (pp. 1097\u20131105). USA: Curran Associates Inc."},{"key":"9318_CR28","unstructured":"Lin, C.-Y. (2004). Rouge: A package for automatic evaluation of summaries. In Marie-Francine\u00a0Moens, S.\u00a0S. (Ed.), Text summarization branches out: Proceedings of the ACL-04 workshop, Barcelona, Spain, July 2004 (pp. 74\u201381). Association for Computational Linguistics."},{"key":"9318_CR29","doi-asserted-by":"crossref","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. In Computer vision\u2014ECCV 2014 (pp. 740\u2013755). Springer.","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"9318_CR30","doi-asserted-by":"crossref","unstructured":"Ma, L., Lu, Z., Shang, L., & Li, H. (2015). Multimodal convolutional neural networks for matching image and sentence. In Proceedings of the IEEE international conference on computer vision (pp. 2623\u20132631).","DOI":"10.1109\/ICCV.2015.301"},{"key":"9318_CR31","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Huang, Z., & Yuille, A. (2014). Deep captioning with multimodal recurrent neural networks (m-rnn). arXiv preprint arXiv:1412.6632."},{"key":"9318_CR32","first-page":"414","volume-title":"Computing information retrieval performance measures efficiently in the presence of tied scores","author":"F McSherry","year":"2008","unstructured":"McSherry, F., & Najork, M. (2008). Computing information retrieval performance measures efficiently in the presence of tied scores (pp. 414\u2013421). Berlin: Springer."},{"key":"9318_CR33","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G., & Dean, J. (2013). Distributed representations of words and phrases and their compositionality. In C. J. C. Burges, L. Bottou, M. Welling, Z. Ghahramani & K. Q. Weinberger (Eds.), Proceedings of the 26th International\nConference on Neural Information Processing Systems (NIPS\u201913) (pp. 3111\u20133119). USA: Curran Associates Inc."},{"key":"9318_CR34","unstructured":"Ngiam, J., Khosla, A., Kim, M., Nam, J., Lee, H., & Ng, A.\u00a0Y. (2011). Multimodal deep learning. In Proceedings of the 28th international conference on machine learning (ICML-11) (pp. 689\u2013696)."},{"key":"9318_CR35","unstructured":"Norouzi, M., Mikolov, T., Bengio, S., Singer, Y., Shlens, J., Frome, A., Corrado, G. S., Dean, J. (2013). Zero-shot learning by convex combination of semantic embeddings. arXiv preprint arXiv:1312.5650."},{"key":"9318_CR36","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., & Manning, C. D. (2014). Glove: Global vectors for word representation. In EMNLP (Vol.\u00a014, pp. 1532\u20131543).","DOI":"10.3115\/v1\/D14-1162"},{"key":"9318_CR37","doi-asserted-by":"crossref","unstructured":"Razavian, A., Azizpour, H., Sullivan, J., & Carlsson, S. (2014). CNN features off-the-shelf: An astounding baseline for recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition workshops (pp. 806\u2013813).","DOI":"10.1109\/CVPRW.2014.131"},{"issue":"3","key":"9318_CR38","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., et al. (2015). Imagenet large scale visual recognition challenge. International Journal of Computer Vision, 115(3), 211\u2013252.","journal-title":"International Journal of Computer Vision"},{"key":"9318_CR39","doi-asserted-by":"crossref","unstructured":"Sak, H., Senior, A.\u00a0W., & Beaufays, F. (2014). Long short-term memory recurrent neural network architectures for large scale acoustic modeling. In INTERSPEECH (pp. 338\u2013342).","DOI":"10.21437\/Interspeech.2014-80"},{"key":"9318_CR40","unstructured":"Sharif, A., Hossein, R., Josephine, A., Stefan, S., Royal, K.\u00a0T.\u00a0H., Sharif Razavian, A., Azizpour, H., Sullivan, J., & Carlsson, S. (2014). CNN features off-the-shelf: An astounding baseline for recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition workshops."},{"key":"9318_CR41","unstructured":"Simonyan, K., & Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556."},{"key":"9318_CR42","doi-asserted-by":"crossref","unstructured":"Sundermeyer, M., Schl\u00fcter, R., & Ney, H. (2012). LSTM neural networks for language modeling. In Interspeech (pp. 194\u2013197).","DOI":"10.21437\/Interspeech.2012-65"},{"issue":"2","key":"9318_CR43","doi-asserted-by":"publisher","first-page":"64","DOI":"10.1145\/2812802","volume":"59","author":"B Thomee","year":"2016","unstructured":"Thomee, B., Shamma, D. A., Friedland, G., Elizalde, B., Ni, K., Poland, D., et al. (2016). Yfcc100m: The new data in multimedia research. Communications of the ACM, 59(2), 64\u201373.","journal-title":"Communications of the ACM"},{"key":"9318_CR44","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., & Erhan, D. (2015). Show and tell: A neural image caption generator. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3156\u20133164).","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"9318_CR45","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., & Lazebnik, S. (2016). Learning deep structure-preserving image-text embeddings. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 5005\u20135013).","DOI":"10.1109\/CVPR.2016.541"},{"issue":"10","key":"9318_CR46","doi-asserted-by":"publisher","first-page":"1550","DOI":"10.1109\/5.58337","volume":"78","author":"PJ Werbos","year":"1990","unstructured":"Werbos, P. J. (1990). Backpropagation through time: What it does and how to do it. Proceedings of the IEEE, 78(10), 1550\u20131560.","journal-title":"Proceedings of the IEEE"},{"key":"9318_CR47","unstructured":"Zhou, B., Lapedriza, A., Xiao, J., Torralba, A., & Oliva, A. (2014). Learning deep features for scene recognition using places database. In Z. Ghahramani, M. Welling, C. Cortes, N. D. Lawrence & K. Q. Weinberger (Eds.), Proceedings of the 27th International Conference on Neural Information Processing Systems (NIPS\u201914), (pp. 487\u2013495). Cambridge, MA, USA: MIT Press."}],"container-title":["Information Retrieval Journal"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10791-017-9318-6\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10791-017-9318-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10791-017-9318-6.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,2]],"date-time":"2024-01-02T14:41:38Z","timestamp":1704206498000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10791-017-9318-6"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,10,14]]},"references-count":47,"journal-issue":{"issue":"2-3","published-print":{"date-parts":[[2018,6]]}},"alternative-id":["9318"],"URL":"https:\/\/doi.org\/10.1007\/s10791-017-9318-6","relation":{},"ISSN":["1386-4564","1573-7659"],"issn-type":[{"type":"print","value":"1386-4564"},{"type":"electronic","value":"1573-7659"}],"subject":[],"published":{"date-parts":[[2017,10,14]]},"assertion":[{"value":"5 November 2016","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"2 October 2017","order":2,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 October 2017","order":3,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}