{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,2]],"date-time":"2026-04-02T23:37:55Z","timestamp":1775173075629,"version":"3.50.1"},"publisher-location":"Cham","reference-count":35,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319464749","type":"print"},{"value":"9783319464756","type":"electronic"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46475-6_5","type":"book-chapter","created":{"date-parts":[[2016,9,16]],"date-time":"2016-09-16T08:48:10Z","timestamp":1474015690000},"page":"69-85","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":639,"title":["Modeling Context in Referring Expressions"],"prefix":"10.1007","author":[{"given":"Licheng","family":"Yu","sequence":"first","affiliation":[]},{"given":"Patrick","family":"Poirson","sequence":"additional","affiliation":[]},{"given":"Shan","family":"Yang","sequence":"additional","affiliation":[]},{"given":"Alexander C.","family":"Berg","sequence":"additional","affiliation":[]},{"given":"Tamara L.","family":"Berg","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,17]]},"reference":[{"issue":"4","key":"5_CR1","doi-asserted-by":"publisher","first-page":"592","DOI":"10.1016\/j.jml.2005.12.008","volume":"54","author":"S Brown-Schmidt","year":"2006","unstructured":"Brown-Schmidt, S., Tanenhaus, M.K.: Watching the eyes when talking about size: an investigation of message formulation and utterance planning. J. Mem. Lang. 54(4), 592\u2013609 (2006)","journal-title":"J. Mem. Lang."},{"key":"5_CR2","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In: CVPR (2015)","DOI":"10.21236\/ADA623249"},{"key":"5_CR3","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F., Srivastava, R.K., Deng, L., Doll\u00e1r, P., Gao, J., He, X., Mitchell, M., Platt, J.C., et al.: From captions to visual concepts and back. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"5_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/978-3-642-15561-1_2","volume-title":"Computer Vision \u2013 ECCV 2010","author":"A Farhadi","year":"2010","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: generating sentences from images. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010. LNCS, vol. 6314, pp. 15\u201329. Springer, Heidelberg (2010). doi: 10.1007\/978-3-642-15561-1_2"},{"key":"5_CR5","doi-asserted-by":"crossref","unstructured":"FitzGerald, N., Artzi, Y., Zettlemoyer, L.S.: Learning distributions over logical forms for referring expression generation. In: EMNLP, pp. 1914\u20131925 (2013)","DOI":"10.18653\/v1\/D13-1197"},{"key":"5_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"51","DOI":"10.1007\/978-3-540-27823-8_6","volume-title":"Natural Language Generation","author":"K Funakoshi","year":"2004","unstructured":"Funakoshi, K., Watanabe, S., Kuriyama, N., Tokunaga, T.: Generating referring expressions using perceptual groups. In: Belz, A., Evans, R., Piwek, P. (eds.) INLG 2004. LNCS, vol. 3123, pp. 51\u201360. Springer, Heidelberg (2004)"},{"key":"5_CR7","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 1440\u20131448 (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"5_CR8","unstructured":"Greff, K., Srivastava, R.K., Koutn\u00edk, J., Steunebrink, B.R., Schmidhuber, J.: LSTM: a search space odyssey (2015). arXiv preprint arXiv:1503.04069"},{"key":"5_CR9","doi-asserted-by":"crossref","unstructured":"Grice, H.P.: Logic and conversation. In: Cole, P., Morgan, J.L. (eds.) Syntax and Semantics: Speech Acts, vol. 3, pp. 41\u201358. Academic Press, San Diego (1975)","DOI":"10.1163\/9789004368811_003"},{"key":"5_CR10","doi-asserted-by":"crossref","first-page":"853","DOI":"10.1613\/jair.3994","volume":"47","author":"M Hodosh","year":"2013","unstructured":"Hodosh, M., Young, P., Hockenmaier, J.: Framing image description as a ranking task: data, models and evaluation metrics. J. Artif. Intell. Res. 47, 853\u2013899 (2013)","journal-title":"J. Artif. Intell. Res."},{"key":"5_CR11","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., Darrell, T.: Natural language object retrieval. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.493"},{"key":"5_CR12","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: fully convolutional localization networks for dense captioning (2015). arXiv preprint arXiv:1511.07571"},{"key":"5_CR13","doi-asserted-by":"crossref","unstructured":"Jordan, P., Walker, M.: Learning attribute selections for non-pronominal expressions. In: ACL (2000)","DOI":"10.3115\/1075218.1075242"},{"key":"5_CR14","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"5_CR15","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.L.: ReferitGame: referring to objects in photographs of natural scenes. In: EMNLP, pp. 787\u2013798 (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"5_CR16","doi-asserted-by":"crossref","unstructured":"Kelleher, J.D., Kruijff, G.J.M.: Incremental generation of spatial referring expressions in situated dialog. In: ACL (2006)","DOI":"10.3115\/1220175.1220306"},{"key":"5_CR17","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. In: TACL (2015)"},{"issue":"1","key":"5_CR18","doi-asserted-by":"publisher","first-page":"173","DOI":"10.1162\/COLI_a_00088","volume":"38","author":"E Krahmer","year":"2012","unstructured":"Krahmer, E., Van Deemter, K.: Computational generation of referring expressions: a survey. Comput. Linguist. 38(1), 173\u2013218 (2012)","journal-title":"Comput. Linguist."},{"key":"5_CR19","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni, G., Premraj, V., Ordonez, V., Dhar, S., Li, S., Choi, Y., Berg, A.C., Berg, T.: Babytalk: understanding and generating simple image descriptions. IEEE Trans. Pattern Anal. Mach. Intell. 35, 2891\u20132903 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"5_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Heidelberg (2014). doi: 10.1007\/978-3-319-10602-1_48"},{"key":"5_CR21","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"5_CR22","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Huang, Z., Yuille, A.: Deep captioning with multimodal recurrent neural networks (m-RNN). In: ICLR (2015)"},{"key":"5_CR23","unstructured":"Mitchell, M., van Deemter, K., Reiter, E.: Natural reference to objects in a visual domain. In: Proceedings of the 6th International Natural Language Generation Conference, pp. 95\u2013104. Association for Computational Linguistics (2010)"},{"key":"5_CR24","unstructured":"Mitchell, M., Reiter, E., van Deemter, K.: Typicality and object reference. Cognitive Science (CogSci) (2013)"},{"key":"5_CR25","unstructured":"Mitchell, M., Van Deemter, K., Reiter, E.: Generating expressions that refer to visible objects. In: HLT-NAACL, pp. 1174\u20131184 (2013)"},{"key":"5_CR26","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.L.: Im2Text: describing images using 1 million captioned photographs. In: Advances in Neural Information Processing Systems (2011)"},{"key":"5_CR27","unstructured":"Rohrbach, A., Rohrbach, M., Hu, R., Darrell, T., Schiele, B.: Grounding of textual phrases in images by reconstruction (2015). arXiv preprint arXiv:1511.03745"},{"issue":"3","key":"5_CR28","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., et al.: Imagenet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"5_CR29","unstructured":"Sadeghi, F., Zitnick, C.L., Farhadi, A.: Visalogy: answering visual analogy questions. In: NIPS (2015)"},{"key":"5_CR30","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition (2014). arXiv preprint arXiv:1409.1556"},{"key":"5_CR31","doi-asserted-by":"crossref","first-page":"207","DOI":"10.1162\/tacl_a_00177","volume":"2","author":"R Socher","year":"2014","unstructured":"Socher, R., Karpathy, A., Le, Q.V., Manning, C.D., Ng, A.Y.: Grounded compositional semantics for finding and describing images with sentences. Trans. Assoc. Comput. Linguist. 2, 207\u2013218 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"5_CR32","doi-asserted-by":"crossref","unstructured":"Viethen, J., Dale, R.: The use of spatial relations in referring expression generation. In: Proceedings of the Fifth International Natural Language Generation Conference, pp. 59\u201367. Association for Computational Linguistics (2008)","DOI":"10.3115\/1708322.1708334"},{"key":"5_CR33","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"issue":"1","key":"5_CR34","doi-asserted-by":"publisher","first-page":"1","DOI":"10.1016\/0010-0285(72)90002-3","volume":"3","author":"T Winograd","year":"1972","unstructured":"Winograd, T.: Understanding natural language. Cogn. Psychol. 3(1), 1\u2013191 (1972)","journal-title":"Cogn. Psychol."},{"key":"5_CR35","unstructured":"Xu, K., Ba, J., Kiros, R., Courville, A., Salakhutdinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2016"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46475-6_5","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T19:14:30Z","timestamp":1749582870000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-46475-6_5"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319464749","9783319464756"],"references-count":35,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46475-6_5","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"17 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.eccv2016.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}