{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,3]],"date-time":"2026-03-03T16:02:59Z","timestamp":1772553779865,"version":"3.50.1"},"publisher-location":"Cham","reference-count":41,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319105925","type":"print"},{"value":"9783319105932","type":"electronic"}],"license":[{"start":{"date-parts":[[2014,1,1]],"date-time":"2014-01-01T00:00:00Z","timestamp":1388534400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2014]]},"DOI":"10.1007\/978-3-319-10593-2_35","type":"book-chapter","created":{"date-parts":[[2014,8,14]],"date-time":"2014-08-14T02:52:23Z","timestamp":1407984743000},"page":"529-545","source":"Crossref","is-referenced-by-count":158,"title":["Improving Image-Sentence Embeddings Using Large Weakly Annotated Photo Collections"],"prefix":"10.1007","author":[{"given":"Yunchao","family":"Gong","sequence":"first","affiliation":[]},{"given":"Liwei","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Micah","family":"Hodosh","sequence":"additional","affiliation":[]},{"given":"Julia","family":"Hockenmaier","sequence":"additional","affiliation":[]},{"given":"Svetlana","family":"Lazebnik","sequence":"additional","affiliation":[]}],"member":"297","reference":[{"key":"35_CR1","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/978-3-642-15561-1_2","volume-title":"Computer Vision \u2013 ECCV 2010","author":"A. Farhadi","year":"2010","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: Generating sentences from images. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part IV. LNCS, vol.\u00a06314, pp. 15\u201329. Springer, Heidelberg (2010)"},{"key":"35_CR2","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., Premraj, V., Dhar, S., Li, S., Choi, Y., Berg, A.C., Berg, T.L.: Baby talk: Understanding and generating image descriptions. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"35_CR3","unstructured":"Li, S., Kulkarni, G., Berg, T.L., Berg, A.C., Choi, Y.: Composing simple image descriptions using web-scale n-grams. In: CoNLL (2011)"},{"key":"35_CR4","unstructured":"Mitchell, M., Han, X., Dodge, J., Mensch, A., Goyal, A., Berg, A., Yamaguchi, K., Berg, T., Stratos, K., Daum\u00e9, I.H.: Midge: Generating image descriptions from computer vision detections. In: EACL (2012)"},{"key":"35_CR5","doi-asserted-by":"crossref","unstructured":"Fidler, S., Sharma, A., Urtasun, R.: A sentence is worth a thousand pixels. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.260"},{"key":"35_CR6","doi-asserted-by":"crossref","unstructured":"Yao, B.Z., Yang, X., Lin, L., Lee, M.W., Zhu, S.C.: I2T: Image parsing to text description. Proceedings of the IEEE 98 (2010)","DOI":"10.1109\/JPROC.2010.2050411"},{"key":"35_CR7","doi-asserted-by":"crossref","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: Data, models and evaluation metrics. Journal of Artificial Intelligence Research (2013)","DOI":"10.1613\/jair.3994"},{"key":"35_CR8","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.L.: Im2Text: Describing images using 1 million captioned photographs. In: NIPS (2011)"},{"key":"35_CR9","doi-asserted-by":"crossref","unstructured":"Socher, R., Le, Q.V., Manning, C.D., Ng, A.Y.: Grounded compositional semantics for finding and describing images with sentences. In: ACL (2013)","DOI":"10.1162\/tacl_a_00177"},{"key":"35_CR10","unstructured":"Kuznetsova, P., Ordonez, V., Berg, A.C., Berg, T.L., Choi, Y.: Collective generation of natural image descriptions. In: ACL (2012)"},{"key":"35_CR11","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"35_CR12","doi-asserted-by":"crossref","unstructured":"Hardoon, D., Szedmak, S., Shawe-Taylor, J.: Canonical correlation analysis; an overview with application to learning methods. Neural Computation 16 (2004)","DOI":"10.1162\/0899766042321814"},{"key":"35_CR13","doi-asserted-by":"crossref","unstructured":"Gong, Y., Ke, Q., Isard, M., Lazebnik, S.: A multi-view embedding space for modeling internet images, tags, and their semantics. IJCV (2013)","DOI":"10.1007\/s11263-013-0658-4"},{"key":"35_CR14","unstructured":"Gong, B., Grauman, K., Sha, F.: Connecting the dots with landmarks: Discriminatively learning domain-invariant features for unsupervised domain adaptation. In: ICML, pp. 222\u2013230 (2013)"},{"key":"35_CR15","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"213","DOI":"10.1007\/978-3-642-15561-1_16","volume-title":"Computer Vision \u2013 ECCV 2010","author":"K. Saenko","year":"2010","unstructured":"Saenko, K., Kulis, B., Fritz, M., Darrell, T.: Adapting visual category models to new domains. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part IV. LNCS, vol.\u00a06314, pp. 213\u2013226. Springer, Heidelberg (2010)"},{"key":"35_CR16","doi-asserted-by":"crossref","unstructured":"Shrivastava, A., Malisiewicz, T., Gupta, A., Efros, A.A.: Data-driven visual similarity for cross-domain image matching. ACM SIGGRAPH ASIA 30(6) (2011)","DOI":"10.1145\/2070781.2024188"},{"key":"35_CR17","doi-asserted-by":"crossref","unstructured":"Hays, J., Efros, A.A.: Scene completion using millions of photographs. ACM Transactions on Graphics (SIGGRAPH) 26(3) (2007)","DOI":"10.1145\/1276377.1276382"},{"key":"35_CR18","unstructured":"Guillaumin, M., Ferrari, V.: Large-scale knowledge transfer for object localization in imageNet. In: CVPR, 3202\u20133209 (2012)"},{"key":"35_CR19","doi-asserted-by":"crossref","unstructured":"Guillaumin, M., Verbeek, J., Schmid, C.: Multimodal semi-supervised learning for image classification. In: CVPR, 902\u2013909 (2010)","DOI":"10.1109\/CVPR.2010.5540120"},{"key":"35_CR20","doi-asserted-by":"crossref","unstructured":"Quattoni, A., Collins, M., Darrell, T.: Learning visual representations using images with captions. In: CVPR (2007)","DOI":"10.1109\/CVPR.2007.383173"},{"key":"35_CR21","doi-asserted-by":"crossref","unstructured":"Wang, G., Hoiem, D., Forsyth, D.: Building text features for object image classification. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206816"},{"key":"35_CR22","doi-asserted-by":"crossref","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. In: TACL (2014)","DOI":"10.1162\/tacl_a_00166"},{"key":"35_CR23","unstructured":"Oliva, A., Torralba, A.: Modeling the shape of the scene: a holistic representation of the spatial envelope. IJCV (2001)"},{"issue":"9","key":"35_CR24","doi-asserted-by":"publisher","first-page":"1582","DOI":"10.1109\/TPAMI.2009.154","volume":"32","author":"K.E.A. Sande van de","year":"2010","unstructured":"van de Sande, K.E.A., Gevers, T., Snoek, C.G.M.: Evaluating color descriptors for object and scene recognition. PAMI\u00a032(9), 1582\u20131596 (2010)","journal-title":"PAMI"},{"key":"35_CR25","unstructured":"Dalal, N., Triggs, B.: Histograms of oriented gradients for human detection. In: CVPR (2005)"},{"key":"35_CR26","doi-asserted-by":"crossref","unstructured":"J\u00e9gou, H., Douze, M., Schmid, C., Perez, P.: Aggregating local descriptors into a compact image representation. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540039"},{"key":"35_CR27","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: ImageNet classification with deep convolutional neural networks. In: NIPS (2012)"},{"key":"35_CR28","unstructured":"Donahue, J., Jia, Y., Vinyals, O., Hoffman, J., Zhang, N., Tzeng, E., Darrell, T.: DeCAF: A deep convolutional activation feature for generic visual recognition. CoRR abs\/1310.1531 (2013)"},{"key":"35_CR29","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: A large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"35_CR30","doi-asserted-by":"crossref","unstructured":"Loper, E., Bird, S.: Nltk: The natural language toolkit. In: Proceedings of the ACL 2002 Workshop on Effective Tools and Methodologies for Teaching Natural Language Processing and Computational Linguistics, vol.\u00a01 (2002)","DOI":"10.3115\/1118108.1118117"},{"key":"35_CR31","unstructured":"Weston, J., Bengio, S., Usunier, N.: Wsabie: Scaling up to large vocabulary image annotation. In: IJCAI (2011)"},{"key":"35_CR32","unstructured":"Duchi, J., Hazan, E., Singer, Y.: Adaptive subgradient methods for online learning and stochastic optimization. JMLR (2011)"},{"key":"35_CR33","unstructured":"Zeiler, M.D.: ADADELTA: An adaptive learning rate method. arXiv preprint arXiv:1212.5701 (2012)"},{"key":"35_CR34","unstructured":"Socher, R., Ganjoo, M., Sridhar, H., Bastani, O., Manning, C.D., Ng, A.Y.: Zero-shot learning through cross-modal transfer. In: NIPS (2013)"},{"key":"35_CR35","doi-asserted-by":"publisher","first-page":"312","DOI":"10.1093\/biomet\/28.3-4.321","volume":"28","author":"H. Hotelling","year":"1936","unstructured":"Hotelling, H.: Relations between two sets of variables. Biometrika\u00a028, 312\u2013377 (1936)","journal-title":"Biometrika"},{"key":"35_CR36","doi-asserted-by":"crossref","unstructured":"Gordo, A., Rodr\u0131guez-Serrano, J.A., Perronnin, F., Valveny, E.: Leveraging category-level labels for instance-level image retrieval. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248035"},{"key":"35_CR37","doi-asserted-by":"crossref","unstructured":"Gopalan, R., Li, R., Chellappa, R.: Domain adaptation for object recognition: An unsupervised approach. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126344"},{"key":"35_CR38","doi-asserted-by":"crossref","unstructured":"Xu, Z., Chen, M., Weinberger, K.Q., Sha, F.: From sBoW to dCoT: Marginalized encoders for text representation. In: CIKM (2011)","DOI":"10.1145\/2396761.2398536"},{"key":"35_CR39","unstructured":"Rahimi, A., Recht, B.: Random features for large-scale kernel machines. In: NIPS (2007)"},{"key":"35_CR40","doi-asserted-by":"crossref","unstructured":"Vincent, P., Larochelle, H., Bengio, Y., Manzagol, P.A.: Extracting and composing robust features with denoising autoencoders. In: ICML, pp. 1096\u20131103 (2008)","DOI":"10.1145\/1390156.1390294"},{"issue":"1","key":"35_CR41","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1561\/2200000006","volume":"2","author":"Y. Bengio","year":"2009","unstructured":"Bengio, Y.: Learning deep architectures for AI. Foundations and Trends in Machine Learning\u00a02(1), 1\u2013127 (2009)","journal-title":"Foundations and Trends in Machine Learning"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2014"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-10593-2_35","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,12,2]],"date-time":"2019-12-02T09:21:58Z","timestamp":1575278518000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-10593-2_35"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2014]]},"ISBN":["9783319105925","9783319105932"],"references-count":41,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-10593-2_35","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2014]]}}}