{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,6,4]],"date-time":"2026-06-04T09:31:56Z","timestamp":1780565516023,"version":"3.54.1"},"publisher-location":"Berlin, Heidelberg","reference-count":26,"publisher":"Springer Berlin Heidelberg","isbn-type":[{"value":"9783642155604","type":"print"},{"value":"9783642155611","type":"electronic"}],"license":[{"start":{"date-parts":[[2010,1,1]],"date-time":"2010-01-01T00:00:00Z","timestamp":1262304000000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2010]]},"DOI":"10.1007\/978-3-642-15561-1_2","type":"book-chapter","created":{"date-parts":[[2010,9,5]],"date-time":"2010-09-05T11:42:57Z","timestamp":1283686977000},"page":"15-29","source":"Crossref","is-referenced-by-count":586,"title":["Every Picture Tells a Story: Generating Sentences from Images"],"prefix":"10.1007","author":[{"given":"Ali","family":"Farhadi","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mohsen","family":"Hejrati","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Mohammad Amin","family":"Sadeghi","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Peter","family":"Young","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Cyrus","family":"Rashtchian","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Julia","family":"Hockenmaier","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"David","family":"Forsyth","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","reference":[{"key":"2_CR1","doi-asserted-by":"crossref","unstructured":"Barnard, K., Duygulu, P., Forsyth, D.: Clustering art. In: CVPR, vol.\u00a0II, pp. 434\u2013441 (2001)","DOI":"10.1109\/CVPR.2001.990994"},{"key":"2_CR2","unstructured":"Mori, Y., Takahashi, H., Oka, R.: Image-to-word transformation based on dividing and vector quantizing images with words. In: WMISR (1999)"},{"key":"2_CR3","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"97","DOI":"10.1007\/3-540-47979-1_7","volume-title":"Computer Vision - ECCV 2002","author":"P. Duygulu","year":"2002","unstructured":"Duygulu, P., Barnard, K., de Freitas, N., Forsyth, D.: Object recognition as machine translation. In: Heyden, A., Sparr, G., Nielsen, M., Johansen, P. (eds.) ECCV 2002. LNCS, vol.\u00a02353, pp. 97\u2013112. Springer, Heidelberg (2002)"},{"key":"2_CR4","doi-asserted-by":"crossref","unstructured":"Datta, R., Li, J., Wang, J.Z.: Content-based image retrieval: approaches and trends of the new age. In: MIR 2005, pp. 253\u2013262 (2005)","DOI":"10.1145\/1101826.1101866"},{"key":"2_CR5","doi-asserted-by":"crossref","unstructured":"Forsyth, D., Berg, T., Alm, C., Farhadi, A., Hockenmaier, J., Loeff, N., Wang, G.: Words and pictures: Categories, modifiers, depiction and iconography. In: Object Categorization: Computer and Human Vision Perspectives, CUP (2009)","DOI":"10.1017\/CBO9780511635465.010"},{"key":"2_CR6","doi-asserted-by":"crossref","unstructured":"Phillips, P.J., Newton, E.: Meta-analysis of face recognition algorithms. In: ICAFGR (2002)","DOI":"10.6028\/NIST.IR.6719"},{"key":"2_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1007\/978-3-540-88682-2_3","volume-title":"Computer Vision \u2013 ECCV 2008","author":"A. Gupta","year":"2008","unstructured":"Gupta, A., Davis, L.: Beyond nouns: Exploiting prepositions and comparative adjectives for learning visual classifiers. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008, Part I. LNCS, vol.\u00a05302, pp. 16\u201329. Springer, Heidelberg (2008)"},{"key":"2_CR8","doi-asserted-by":"crossref","unstructured":"Li, L.J., Fei-Fei, L.: What, where and who? classifying event by scene and object recognition. In: ICCV (2007)","DOI":"10.1109\/ICCV.2007.4408872"},{"key":"2_CR9","doi-asserted-by":"crossref","unstructured":"Li, L.J., Socher, R., Fei-Fei, L.: Towards total scene understanding:classification, annotation and segmentation in an automatic framework. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206718"},{"key":"2_CR10","doi-asserted-by":"crossref","unstructured":"Gupta, A., Davis, L.: Objects in action: An approach for combining action understanding and object perception. In: CVPR (2007)","DOI":"10.1109\/CVPR.2007.383331"},{"key":"2_CR11","doi-asserted-by":"crossref","unstructured":"Gupta, A., Davis, A.K.,, L.: Observing human-object interactions: Using spatial and functional compatibility for recognition. Trans. on PAMI (2009)","DOI":"10.1109\/TPAMI.2009.83"},{"key":"2_CR12","doi-asserted-by":"crossref","unstructured":"Yao, B., Fei-Fei, L.: Modeling mutual context of object and human pose in human-object interaction activities. In: CVPR (2010)","DOI":"10.1109\/CVPR.2010.5540235"},{"key":"2_CR13","unstructured":"Berg, T.L., Berg, A.C., Edwards, J., Forsyth, D.A.: Who\u2019s in the picture. In: Advances in Neural Information Processing (2004)"},{"key":"2_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"86","DOI":"10.1007\/978-3-540-88688-4_7","volume-title":"Computer Vision \u2013 ECCV 2008","author":"T. Mensink","year":"2008","unstructured":"Mensink, T., Verbeek, J.: Improving people search using query expansions: How friends help to find people. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008, Part II. LNCS, vol.\u00a05303, pp. 86\u201399. Springer, Heidelberg (2008)"},{"key":"2_CR15","unstructured":"Luo, J., Caputo, B., Ferrari, V.: Who\u2019s doing what: Joint modeling of names and verbs for simultaneous face and pose annotation. In: NIPS (2009)"},{"key":"2_CR16","doi-asserted-by":"crossref","unstructured":"Coyne, B., Sproat, R.: Wordseye: an automatic text-to-scene conversion system. In: SIGGRAPH 2001 (2001)","DOI":"10.1145\/383259.383316"},{"key":"2_CR17","doi-asserted-by":"crossref","unstructured":"Gupta, A., Srinivasan, P., Shi, J., Davis, L.: Understanding videos, constructing plots: Learning a visually grounded storyline model from annotated videos. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206492"},{"key":"2_CR18","doi-asserted-by":"crossref","unstructured":"Yao, B.Z., Yang, X., Lin, L., Lee, M.W., Zhu, S.C.: I2t: Image parsing to text description. Proc. IEEE (2010) (in Press)","DOI":"10.1109\/JPROC.2010.2050411"},{"key":"2_CR19","doi-asserted-by":"crossref","unstructured":"Felzenszwalb, P., Mcallester, D., Ramanan, D.: A discriminatively trained, multiscale, deformable part model. In: CVPR 2008 (2008)","DOI":"10.1109\/CVPR.2008.4587597"},{"key":"2_CR20","unstructured":"Hoiem, D., Divvala, S., Hays, J.: Pascal voc 2009 challenge. In: PASCAL challenge workshop in ECCV (2009)"},{"key":"2_CR21","doi-asserted-by":"crossref","unstructured":"Oliva, A., Torralba, A.: Building the gist of a scene: the role of global image features in recognition. In: Progress in Brain Research, p. 2006 (2006)","DOI":"10.1016\/S0079-6123(06)55002-2"},{"key":"2_CR22","doi-asserted-by":"crossref","unstructured":"Curran, J., Clark, S., Bos, J.: Linguistically motivated large-scale nlp with c&c and boxer. In: ACL, pp. 33\u201336","DOI":"10.3115\/1557769.1557781"},{"key":"2_CR23","unstructured":"Lin, D.: An information-theoretic definition of similarity. In: ICML, 296\u2013304 (1998)"},{"key":"2_CR24","doi-asserted-by":"crossref","unstructured":"Taskar, B., Chatalbashev, V., Koller, D., Guestrin, C.: Learning structured prediction models: a large margin approach. In: ICML, pp. 896\u2013903 (2005)","DOI":"10.1145\/1102351.1102464"},{"key":"2_CR25","unstructured":"Ratliff, N., Bagnell, J.A., Zinkevich, M.: Subgradient methods for maximum margin structured learning. In: ICML (2006)"},{"key":"2_CR26","unstructured":"Rashtchian, C., Young, P., Hodosh, M., Hockenmaier, J.: Collecting image annotations using amazon\u2019s mechanical turk. In: NAACL HLT 2010 Workshop on Creating Speech and Language Data with Amazon\u2019s Mechanical Turk (2010)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2010"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-642-15561-1_2","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,2,25]],"date-time":"2025-02-25T15:31:47Z","timestamp":1740497507000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-642-15561-1_2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2010]]},"ISBN":["9783642155604","9783642155611"],"references-count":26,"URL":"https:\/\/doi.org\/10.1007\/978-3-642-15561-1_2","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2010]]}}}