{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T14:12:16Z","timestamp":1742911936058,"version":"3.40.3"},"publisher-location":"Cham","reference-count":30,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030110178"},{"type":"electronic","value":"9783030110185"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-11018-5_14","type":"book-chapter","created":{"date-parts":[[2019,1,24]],"date-time":"2019-01-24T05:50:50Z","timestamp":1548309050000},"page":"153-161","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Knowing When to Look for What and Where: Evaluating Generation of Spatial Descriptions with Adaptive Attention"],"prefix":"10.1007","author":[{"ORCID":"https:\/\/orcid.org\/0000-0002-2598-5091","authenticated-orcid":false,"given":"Mehdi","family":"Ghanimifard","sequence":"first","affiliation":[]},{"ORCID":"https:\/\/orcid.org\/0000-0002-4019-7966","authenticated-orcid":false,"given":"Simon","family":"Dobnik","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2019,1,23]]},"reference":[{"key":"14_CR1","doi-asserted-by":"crossref","unstructured":"Agrawal, A., Batra, D., Parikh, D., Kembhavi, A.: Don\u2019t just assume; look and answer: overcoming priors for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4971\u20134980 (2018)","DOI":"10.1109\/CVPR.2018.00522"},{"key":"14_CR2","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., Klein, D.: Neural module networks. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 39\u201348 (2016)","DOI":"10.1109\/CVPR.2016.12"},{"key":"14_CR3","unstructured":"Ba, J., Mnih, V., Kavukcuoglu, K.: Multiple object recognition with visual attention. arXiv preprint arXiv:1412.7755 (2014)"},{"key":"14_CR4","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. arXiv preprint arXiv:1409.0473 (2014)"},{"issue":"8","key":"14_CR5","doi-asserted-by":"publisher","first-page":"1798","DOI":"10.1109\/TPAMI.2013.50","volume":"35","author":"Y Bengio","year":"2013","unstructured":"Bengio, Y., Courville, A., Vincent, P.: Representation learning: a review and new perspectives. IEEE Trans. Pattern Anal. Mach. Intell. 35(8), 1798\u20131828 (2013)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"14_CR6","volume-title":"Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit","author":"S Bird","year":"2009","unstructured":"Bird, S., Klein, E., Loper, E.: Natural Language Processing with Python: Analyzing Text with the Natural Language Toolkit. O\u2019Reilly Media Inc., Sebastopol (2009)"},{"key":"14_CR7","series-title":"Lecture Notes in Computer Science (Lecture Notes in Artificial Intelligence)","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1007\/978-3-540-32255-9_6","volume-title":"Spatial Cognition IV. Reasoning, Action, Interaction","author":"KR Coventry","year":"2005","unstructured":"Coventry, K.R., et al.: Spatial prepositions and vague quantifiers: implementing the functional geometric framework. In: Freksa, C., Knauff, M., Krieg-Br\u00fcckner, B., Nebel, B., Barkowsky, T. (eds.) Spatial Cognition 2004. LNCS (LNAI), vol. 3343, pp. 98\u2013110. Springer, Heidelberg (2005). https:\/\/doi.org\/10.1007\/978-3-540-32255-9_6"},{"key":"14_CR8","doi-asserted-by":"crossref","DOI":"10.4324\/9780203641521","volume-title":"Saying, Seeing, and Acting: The Psychological Semantics of Spatial Prepositions","author":"KR Coventry","year":"2004","unstructured":"Coventry, K.R., Garrod, S.C.: Saying, Seeing, and Acting: The Psychological Semantics of Spatial Prepositions. Psychology Press, Hove (2004)"},{"key":"14_CR9","doi-asserted-by":"crossref","unstructured":"Dobnik, S., Ghanimifard, M., Kelleher, J.D.: Exploring the functional and geometric bias of spatial relations using neural language models. In: Proceedings of the First International Workshop on Spatial Language Understanding (SpLU 2018) at NAACL-HLT 2018, pp. 1\u201311. Association for Computational Linguistics, New Orleans, 6 June 2018","DOI":"10.18653\/v1\/W18-1401"},{"key":"14_CR10","unstructured":"Dobnik, S., Kelleher, J.D.: Modular networks: an approach to the top-down versus bottom-up dilemma in natural language processing. In: Forthcoming in Post-proceedings of the Conference on Logic and Machine Learning in Natural Language (LaML), vol. 1, no. 1, pp. 1\u20138, 12\u201314 June 2017"},{"key":"14_CR11","volume-title":"Language and Spatial Cognition: An Interdisciplinary Study of the Prepositions in English","author":"A Herskovits","year":"1986","unstructured":"Herskovits, A.: Language and Spatial Cognition: An Interdisciplinary Study of the Prepositions in English. Cambridge University Press, Cambridge (1986)"},{"key":"14_CR12","unstructured":"Kelleher, J.D., Dobnik, S.: What is not where: the challenge of integrating spatial representations into deep learning architectures. CLASP Papers in Computational Linguistics, p. 41 (2017)"},{"issue":"02","key":"14_CR13","doi-asserted-by":"publisher","first-page":"217","DOI":"10.1017\/S0140525X00029733","volume":"16","author":"Barbara Landau","year":"1993","unstructured":"Landau, B., Jackendoff, R.: \u201cwhat\u201d and \u201cwhere\u201d in spatial language and spatial cognition. Behav. Brain Sci. 16(2), 217\u2013238, 255\u2013265 (1993)","journal-title":"Behavioral and Brain Sciences"},{"key":"14_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"14_CR15","doi-asserted-by":"crossref","unstructured":"Liu, C., Mao, J., Sha, F., Yuille, A.L.: Attention correctness in neural image captioning. In: AAAI, pp. 4176\u20134182 (2017)","DOI":"10.1609\/aaai.v31i1.11197"},{"key":"14_CR16","doi-asserted-by":"crossref","first-page":"493","DOI":"10.7551\/mitpress\/4107.003.0015","volume-title":"Language and Space","author":"GD Logan","year":"1996","unstructured":"Logan, G.D., Sadler, D.D.: A computational analysis of the apprehension of spatial relations. In: Bloom, P., Peterson, M.A., Nadel, L., Garrett, M.F. (eds.) Language and Space, pp. 493\u2013530. MIT Press, Cambridge (1996)"},{"key":"14_CR17","doi-asserted-by":"crossref","unstructured":"Lu, J., Xiong, C., Parikh, D., Socher, R.: Knowing when to look: adaptive attention via a visual sentinel for image captioning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), vol. 6 (2017)","DOI":"10.1109\/CVPR.2017.345"},{"key":"14_CR18","unstructured":"Mnih, V., Heess, N., Graves, A., et al.: Recurrent models of visual attention. In: Advances in Neural Information Processing Systems, pp. 2204\u20132212 (2014)"},{"key":"14_CR19","unstructured":"Park, D.H., Hendricks, L.A., Akata, Z., Schiele, B., Darrell, T., Rohrbach, M.: Attentive explanations: justifying decisions and pointing to the evidence. arXiv preprint arXiv:1612.04757 (2016)"},{"key":"14_CR20","doi-asserted-by":"crossref","unstructured":"Ramisa, A., Wang, J., Lu, Y., Dellandrea, E., Moreno-Noguer, F., Gaizauskas, R.: Combining geometric, textual and visual features for predicting prepositions in image descriptions. In: Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing, pp. 214\u2013220 (2015)","DOI":"10.18653\/v1\/D15-1022"},{"key":"14_CR21","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/3608.001.0001","volume-title":"The Human Semantic Potential: Spatial Language and Constrained Connectionism","author":"T Regier","year":"1996","unstructured":"Regier, T.: The Human Semantic Potential: Spatial Language and Constrained Connectionism. MIT Press, Cambridge (1996)"},{"key":"14_CR22","doi-asserted-by":"crossref","unstructured":"Ribeiro, M.T., Singh, S., Guestrin, C.: Why should i trust you?: explaining the predictions of any classifier. In: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1135\u20131144. ACM (2016)","DOI":"10.1145\/2939672.2939778"},{"key":"14_CR23","doi-asserted-by":"crossref","unstructured":"Selvaraju, R.R., Cogswell, M., Das, A., Vedantam, R., Parikh, D., Batra, D., et al.: Grad-CAM: visual explanations from deep networks via gradient-based localization. In: ICCV, pp. 618\u2013626 (2017)","DOI":"10.1109\/ICCV.2017.74"},{"key":"14_CR24","unstructured":"Shekhar, R., Pezzelle, S., Herbelot, A., Nabi, M., Sangineto, E., Bernardi, R.: Vision and language integration: moving beyond objects. In: IWCS 2017\u201312th International Conference on Computational Semantics\u2013Short papers (2017)"},{"key":"14_CR25","doi-asserted-by":"crossref","unstructured":"Shekhar, R., et al.: FOIL it! find one mismatch between image and language caption. In: Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (ACL) (Long Papers), vol. 1, pp. 255\u2013265 (2017)","DOI":"10.18653\/v1\/P17-1024"},{"key":"14_CR26","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Advances in Neural Information Processing Systems, pp. 3104\u20133112 (2014)"},{"key":"14_CR27","doi-asserted-by":"crossref","unstructured":"Szegedy, C., et al.: Going deeper with convolutions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20139 (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"14_CR28","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: 2015 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3156\u20133164. IEEE (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"14_CR29","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057 (2015)"},{"key":"14_CR30","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018 Workshops"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-11018-5_14","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,7,14]],"date-time":"2024-07-14T08:43:36Z","timestamp":1720946616000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-11018-5_14"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030110178","9783030110185"],"references-count":30,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-11018-5_14","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"23 January 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}