{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,2,12]],"date-time":"2026-02-12T17:07:44Z","timestamp":1770916064425,"version":"3.50.1"},"reference-count":89,"publisher":"Springer Science and Business Media LLC","issue":"1-3","license":[{"start":{"date-parts":[[2017,8,29]],"date-time":"2017-08-29T00:00:00Z","timestamp":1503964800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"funder":[{"name":"German Academic Exchange Service (DAAD)"},{"name":"Deutsche Forschungsgemeinschaft (DE)","award":["Collaborative Research Center (CRC) 1223"],"award-info":[{"award-number":["Collaborative Research Center (CRC) 1223"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2017,12]]},"DOI":"10.1007\/s11263-017-1038-2","type":"journal-article","created":{"date-parts":[[2017,8,29]],"date-time":"2017-08-29T15:11:11Z","timestamp":1504019471000},"page":"110-135","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":69,"title":["Ask Your Neurons: A Deep Learning Approach to Visual Question Answering"],"prefix":"10.1007","volume":"125","author":[{"given":"Mateusz","family":"Malinowski","sequence":"first","affiliation":[]},{"given":"Marcus","family":"Rohrbach","sequence":"additional","affiliation":[]},{"given":"Mario","family":"Fritz","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,8,29]]},"reference":[{"key":"1038_CR1","doi-asserted-by":"crossref","unstructured":"Akata, Z., Malinowski, M., Fritz, M., & Schiele, B. (2016). Multi-cue zero-shot learning with strong supervision. 
In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.14"},{"key":"1038_CR2","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., & Klein, D. (2016a). Neural module networks. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.12"},{"key":"1038_CR3","doi-asserted-by":"crossref","unstructured":"Andreas, J., Rohrbach, M., Darrell, T., & Klein, D. (2016b). Learning to compose neural networks for question answering. In Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL).","DOI":"10.18653\/v1\/N16-1181"},{"key":"1038_CR4","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Lawrence Zitnick, C., & Parikh, D. (2015) Vqa: Visual question answering. In Proceedings of the IEEE International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2015.279"},{"key":"1038_CR5","unstructured":"Bastien, F., Lamblin, P., Pascanu, R., Bergstra, J., Goodfellow, I., Bergeron, A., Bouchard, N., & Bengio, Y. (2012) Theano: new features and speed improvements. Deep Learning and Unsupervised Feature Learning NIPS 2012 Workshop."},{"key":"1038_CR6","doi-asserted-by":"crossref","unstructured":"Berant, J., & Liang, P. (2014) Semantic parsing via paraphrasing. In Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL).","DOI":"10.3115\/v1\/P14-1133"},{"key":"1038_CR7","unstructured":"Chen, K., Wang, J., Chen, L. C., Gao, H., Xu, W., & Nevatia, R. (2015) Abc-cnn: An attention based convolutional neural network for visual question answering. arXiv:1511.05960 ."},{"key":"1038_CR8","doi-asserted-by":"crossref","unstructured":"Cho, K., van Merrienboer, B., Gulcehre, C., Bougares, F., Schwenk, H., Bahdanau, D., & Bengio, Y. 
(2014) Learning phrase representations using rnn encoder-decoder for statistical machine translation. In Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP).","DOI":"10.3115\/v1\/D14-1179"},{"key":"1038_CR9","unstructured":"Chollet, F. (2015) keras. https:\/\/github.com\/fchollet\/keras ."},{"issue":"1","key":"1038_CR10","doi-asserted-by":"crossref","first-page":"37","DOI":"10.1177\/001316446002000104","volume":"20","author":"J Cohen","year":"1960","unstructured":"Cohen, J., et al. (1960). A coefficient of agreement for nominal scales. Educational and psychological measurement, 20(1), 37\u201346.","journal-title":"Educational and psychological measurement"},{"key":"1038_CR11","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., & Darrell, T. (2015). Long-term recurrent convolutional networks for visual recognition and description. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298878"},{"issue":"3","key":"1038_CR12","doi-asserted-by":"crossref","first-page":"613","DOI":"10.1177\/001316447303300309","volume":"33","author":"JL Fleiss","year":"1973","unstructured":"Fleiss, J. L., & Cohen, J. (1973). The equivalence of weighted kappa and the intraclass correlation coefficient as measures of reliability. Educational and psychological measurement, 33(3), 613\u2013619.","journal-title":"Educational and psychological measurement"},{"key":"1038_CR13","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D. H., Yang, D., Rohrbach, A., Darrell, T., & Rohrbach, M. (2016) Multimodal compact bilinear pooling for visual question answering and visual grounding. In Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP).","DOI":"10.18653\/v1\/D16-1044"},{"key":"1038_CR14","unstructured":"Gao, H., Mao, J., Zhou, J., Huang, Z., Wang, L., Xu, W. 
(2015) Are you talking to a machine? dataset and methods for multilingual image question answering. In Advances in Neural Information Processing Systems (NIPS)."},{"key":"1038_CR15","doi-asserted-by":"crossref","unstructured":"Geman, D., Geman, S., Hallonquist, N., & Younes, L. (2015). Visual turing test for computer vision systems. In Proceedings of the National Academy of Sciences. National Academy of Sciences.","DOI":"10.1073\/pnas.1422953112"},{"key":"1038_CR16","unstructured":"He, K., Zhang, X., Ren, S., & Sun, J. (2015). Deep residual learning for image recognition. arXiv:1512.03385 ."},{"issue":"8","key":"1038_CR17","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"1038_CR18","doi-asserted-by":"crossref","unstructured":"Hu, R., Rohrbach, M., & Darrell, T. (2016a). Segmentation from natural language expressions. In Proceedings of the European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-319-46448-0_7"},{"key":"1038_CR19","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., & Darrell, T. (2016b). Natural language object retrieval. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.493"},{"key":"1038_CR20","unstructured":"Ilievski, I., Yan, S., & Feng, J. (2016). A focused dynamic attention model for visual question answering. arXiv:1604.01485 ."},{"key":"1038_CR21","doi-asserted-by":"crossref","unstructured":"Iyyer, M., Boyd-Graber, J., Claudino, L., Socher, R., & Daum\u00e9 III, H. (2014). A neural network for factoid question answering over paragraphs. 
In Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP).","DOI":"10.3115\/v1\/D14-1070"},{"key":"1038_CR22","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Long, J., Girshick, R., Guadarrama, S., & Darrell, T. (2014). Caffe: Convolutional architecture for fast feature embedding. arXiv:1408.5093 ."},{"key":"1038_CR23","unstructured":"Jiang, A., Wang, F., Porikli, F., & Li, Y. (2015). Compositional memory for visual question answering. arXiv:1511.05676 ."},{"key":"1038_CR24","doi-asserted-by":"crossref","unstructured":"Kafle, K., & Kanan, C. (2016). Answer-type prediction for visual question answering. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.538"},{"key":"1038_CR25","doi-asserted-by":"crossref","unstructured":"Kalchbrenner, N., Grefenstette, E., & Blunsom, P. (2014). A convolutional neural network for modelling sentences. In Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL).","DOI":"10.3115\/v1\/P14-1062"},{"key":"1038_CR26","doi-asserted-by":"crossref","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"1038_CR27","unstructured":"Karpathy, A., Joulin, A., & Fei-Fei, L. (2014). Deep fragment embeddings for bidirectional image sentence mapping. In Advances in Neural Information Processing Systems (NIPS)."},{"key":"1038_CR28","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., & Berg, Tamara\u00a0L. (2014). Referit game: Referring to objects in photographs of natural scenes. In Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP).","DOI":"10.3115\/v1\/D14-1086"},{"key":"1038_CR29","doi-asserted-by":"crossref","unstructured":"Kim, Y. 
(2014) Convolutional neural networks for sentence classification. In Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP).","DOI":"10.3115\/v1\/D14-1181"},{"key":"1038_CR30","unstructured":"Kim, J. H., On, K. W., Kim, J., Ha, J. W., & Zhang, B. T. (2016). Hadamard product for low-rank bilinear pooling. arXiv preprint arXiv:1610.04325 ."},{"key":"1038_CR31","unstructured":"Kingma, D., & Ba, J. (2014). Adam: A method for stochastic optimization. arXiv:1412.6980 ."},{"key":"1038_CR32","doi-asserted-by":"crossref","unstructured":"Klein, D., & Manning, C. D. (2003). Accurate unlexicalized parsing. In Proceedings of the 41st Annual Meeting on Association for Computational Linguistics-Volume 1, (pp. 423\u2013430). Association for Computational Linguistics.","DOI":"10.3115\/1075096.1075150"},{"key":"1038_CR33","doi-asserted-by":"crossref","unstructured":"Kong, C., Lin, D., Bansal, M., Urtasun, R., & Fidler, S. (2014). What are you talking about? text-to-image coreference. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2014.455"},{"key":"1038_CR34","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L. J., Shamma, D. A., Bernstein, M., & Fei-Fei, L. (2016). Visual genome: Connecting language and vision using crowdsourced dense image annotations. arXiv:1602.07332 ."},{"key":"1038_CR35","doi-asserted-by":"crossref","first-page":"193","DOI":"10.1162\/tacl_a_00220","volume":"1","author":"J Krishnamurthy","year":"2013","unstructured":"Krishnamurthy, J., & Kollar, T. (2013). Jointly learning to parse and perceive: Connecting natural language to the physical world. Transactions of the Association for Computational Linguistics (TACL), 1, 193\u2013206.","journal-title":"Transactions of the Association for Computational Linguistics (TACL)"},{"key":"1038_CR36","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. 
(2012). Imagenet classification with deep convolutional neural networks. In Advances in Neural Information Processing Systems (NIPS)."},{"key":"1038_CR37","unstructured":"Kumar, A., Irsoy, O., Su, J., Bradbury, J., English, R., Pierce, B., Ondruska, P., Gulrajani, I., & Socher, R. (2015). Ask me anything: Dynamic memory networks for natural language processing. arXiv preprint arXiv:1506.07285 ."},{"key":"1038_CR38","doi-asserted-by":"crossref","unstructured":"LeCun,Y., Bottou, L\u00e9on, Bengio, Y., & Haffner, P. (1998). Gradient-based learning applied to document recognition. Proceedings of the IEEE.","DOI":"10.1109\/5.726791"},{"issue":"2","key":"1038_CR39","doi-asserted-by":"crossref","first-page":"389","DOI":"10.1162\/COLI_a_00127","volume":"39","author":"P Liang","year":"2013","unstructured":"Liang, P., Jordan, M. I., & Klein, D. (2013). Learning dependency-based compositional semantics. Computational Linguistics, 39(2), 389\u2013446.","journal-title":"Computational Linguistics"},{"key":"1038_CR40","doi-asserted-by":"crossref","unstructured":"Lin, T. Y, Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft coco: Common objects in context. In Proceedings of the European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"1038_CR41","unstructured":"Lu, J., Yang, J., Batra, D., & Parikh, D. (2016). Hierarchical co-attention for visual question answering. In Advances in Neural Information Processing Systems (NIPS)."},{"key":"1038_CR42","doi-asserted-by":"crossref","unstructured":"Ma, L., Lu, Z., & Li, H. (2016). Learning to answer questions from image using convolutional neural network. In Proceedings of the Conference on Artificial Intelligence (AAAI).","DOI":"10.1609\/aaai.v30i1.10442"},{"key":"1038_CR43","unstructured":"Malinowski, M., & Fritz, M. (2014a). A multi-world approach to question answering about real-world scenes based on uncertain input. 
In Advances in Neural Information Processing Systems (NIPS)."},{"key":"1038_CR44","unstructured":"Malinowski, M., & Fritz, M. (2014b). Towards a visual turing challenge. In Learning Semantics (NIPS workshop)."},{"key":"1038_CR45","unstructured":"Malinowski, M., & Fritz, M. (2014c). A pooling approach to modelling spatial relations for image retrieval and annotation. arXiv:1411.5190 ."},{"key":"1038_CR46","unstructured":"Malinowski, M., & Fritz, M. (2015). Hard to cheat: A turing test based on answering questions about images. AAAI Workshop: Beyond the Turing Test."},{"key":"1038_CR47","unstructured":"Malinowski, M., & Fritz, M. (2016). Tutorial on answering questions about images with deep learning. arXiv preprint arXiv:1610.01076 ."},{"key":"1038_CR48","doi-asserted-by":"crossref","unstructured":"Malinowski, M, Rohrbach, M, & Fritz, M. (2015). Ask your neurons: A neural-based approach to answering questions about images. In Proceedings of the IEEE International Conference on Computer Vision (ICCV), (pp. 1\u20139).","DOI":"10.1109\/ICCV.2015.9"},{"key":"1038_CR49","volume-title":"Foundations of statistical natural language processing","author":"CD Manning","year":"1999","unstructured":"Manning, C. D., & Sch\u00fctze, H. (1999). Foundations of statistical natural language processing (Vol. 999). Cambridge: MIT Press."},{"key":"1038_CR50","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A., & Murphy, K. (2016). Generation and comprehension of unambiguous object descriptions. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.9"},{"key":"1038_CR51","unstructured":"Matuszek, C., Fitzgerald, N., Zettlemoyer, L., Bo, L., & Fox, D. (2012). A joint model of language and perception for grounded attribute learning. 
In Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"1038_CR52","doi-asserted-by":"crossref","unstructured":"Nag Chowdhury, S., Malinowski, M., Bulling, A., & Fritz, M. (2016) Xplore-m-ego: Contextual media retrieval using natural language queries. In ACM International Conference on Multimedia Retrieval (ICMR).","DOI":"10.1145\/2911996.2912044"},{"key":"1038_CR53","unstructured":"Nakashole, N., Tylenda, T., & Weikum, T. (2013). Fine-grained semantic typing of emerging entities. In Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL)."},{"key":"1038_CR54","unstructured":"Noh, H., Seo, P. H., & Han, B. (2015). Image question answering using convolutional neural network with dynamic parameter prediction. arXiv:1511.05756 ."},{"key":"1038_CR55","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., & Manning, C. D. (2014). Glove: Global vectors for word representation. In Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP).","DOI":"10.3115\/v1\/D14-1162"},{"key":"1038_CR56","doi-asserted-by":"crossref","unstructured":"Plummer, B., Wang, L., Cervantes, C., Caicedo, J., Hockenmaier, J., & Lazebnik, S. (2015). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. In Proceedings of the IEEE International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2015.303"},{"key":"1038_CR57","unstructured":"Plummer, B., Wang, L., Cervantes, C., Caicedo, J., Hockenmaier, J., & Lazebnik, S. (2016). Flickr30k entities: Collecting region-to-phrase correspondences for richer image-to-sentence models. arXiv:1505.04870 ."},{"key":"1038_CR58","unstructured":"Prakash, A., & Storer, J. (2016). 
Highway networks for visual question answering."},{"key":"1038_CR59","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1162\/tacl_a_00207","volume":"1","author":"M Regneri","year":"2013","unstructured":"Regneri, M., Rohrbach, M., Wetzel, D., Thater, S., Schiele, B., & Pinkal, M. (2013). Grounding action descriptions in videos. Transactions of the Association for Computational Linguistics (TACL), 1, 25\u201336.","journal-title":"Transactions of the Association for Computational Linguistics (TACL)"},{"issue":"2","key":"1038_CR60","first-page":"5","volume":"1","author":"M Ren","year":"2015","unstructured":"Ren, M., Kiros, R., & Zemel, R. (2015). Image question answering: A visual semantic embedding model and a new dataset. Advances in Neural Information Processing Systems (NIPS), 1(2), 5.","journal-title":"Advances in Neural Information Processing Systems (NIPS)"},{"key":"1038_CR61","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Hu, R., Darrell, T., & Schiele, B. (2015a). Grounding of textual phrases in images by reconstruction. In Proceedings of the European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-319-46448-0_49"},{"key":"1038_CR62","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., & Schiele, B. (2015b). A dataset for movie description. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"1038_CR63","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A. C., & Fei-Fei, L. (2014). Imagenet large scale visual recognition challenge. arXiv:1409.0575 ."},{"key":"1038_CR64","unstructured":"Saito, K., Shin, A., Ushiku, Y., & Harada, T. (2016). Dualnet: Domain-invariant network for visual question answering. arXiv preprint arXiv:1606.06108 ."},{"key":"1038_CR65","doi-asserted-by":"crossref","unstructured":"Shih, K. 
J., Singh, S., & Hoiem, D. (2016). Where to look: Focus regions for visual question answering. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.499"},{"key":"1038_CR66","doi-asserted-by":"crossref","unstructured":"Silberman, N., Hoiem, D., Kohli, P., & Fergus, R. (2012). Indoor segmentation and support inference from rgbd images. In Proceedings of the European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-642-33715-4_54"},{"key":"1038_CR67","unstructured":"Simonyan, K., & Zisserman, A. (2014) Very deep convolutional networks for large-scale image recognition. arXiv:1409.1556 ."},{"key":"1038_CR68","unstructured":"Sutskever, I., Vinyals, O., & Le, Q. V. (2014). Sequence to sequence learning with neural networks. In Advances in Neural Information Processing Systems (NIPS)."},{"key":"1038_CR69","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., & Rabinovich, A. (2014). Going deeper with convolutions. arXiv:1409.4842 ."},{"key":"1038_CR70","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., & Fidler, S. (2016). Movieqa: Understanding stories in movies through question-answering. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.501"},{"key":"1038_CR71","unstructured":"Trecvid (2014). Trecvid med 14. http:\/\/nist.gov\/itl\/iad\/mig\/med14.cfm ."},{"key":"1038_CR72","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., & Saenko, K. (2015a). Sequence to sequence\u2013video to text. In Proceedings of the IEEE International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2015.515"},{"key":"1038_CR73","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., & Saenko, K. (2015b). 
Translating videos to natural language using deep recurrent neural networks. In Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL).","DOI":"10.3115\/v1\/N15-1173"},{"key":"1038_CR74","unstructured":"Vinyals, O., Toshev, A., Bengio, S., & Erhan, D. (2014). Show and tell: A neural image caption generator. arXiv:1411.4555 ."},{"key":"1038_CR75","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., & Lazebnik, S. (2016). Learning deep structure-preserving image-text embeddings. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.541"},{"key":"1038_CR76","unstructured":"Weston, J., Chopra, S., & Bordes, A. (2014). Memory networks. arXiv:1410.3916 ."},{"key":"1038_CR77","doi-asserted-by":"crossref","unstructured":"Wu, Z., & Palmer, M. (1994). Verbs semantics and lexical selection. In Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL).","DOI":"10.3115\/981732.981751"},{"key":"1038_CR78","doi-asserted-by":"crossref","unstructured":"Wu, Q., Wang, P., Shen, C., van den Hengel, A., & Dick, A. (2016). Ask Me Anything: Free-form Visual Question Answering Based on Knowledge from External Sources. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.500"},{"key":"1038_CR79","unstructured":"Xiong, C., Merity, S., & Socher, R. (2016). Dynamic memory networks for visual and textual question answering. arXiv preprint arXiv:1603.01417 ."},{"key":"1038_CR80","unstructured":"Xu, H., & Saenko, K. (2015). Ask, attend and answer: Exploring question-guided spatial attention for visual question answering. arXiv:1511.05234 ."},{"key":"1038_CR81","unstructured":"Xu, K., Ba, J., Kiros, R., Courville, A., Salakhutdinov, R., Zemel, R., & Bengio, Y. (2015). Show, attend and tell: Neural image caption generation with visual attention. 
Proceedings of the International Conference on Machine Learning (ICML)."},{"key":"1038_CR82","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., & Smola, A. (2015). Stacked attention networks for image question answering. arXiv:1511.02274 ."},{"key":"1038_CR83","doi-asserted-by":"crossref","unstructured":"Yu, L., Park, E., Berg, A. C., & Berg, T. L. (2015). Visual madlibs: Fill in the blank description generation and question answering. In Proceedings of the IEEE International Conference on Computer Vision (ICCV), pages 2461\u20132469.","DOI":"10.1109\/ICCV.2015.283"},{"key":"1038_CR84","doi-asserted-by":"crossref","unstructured":"Yu, L., Poirson, P., Yang, S., Berg, A. C., & Berg, T. L. (2016). Modeling context in referring expressions. In European Conference on Computer Vision, pages 69\u201385. Springer.","DOI":"10.1007\/978-3-319-46475-6_5"},{"key":"1038_CR85","unstructured":"Zaremba, W., & Sutskever, I. (2014). Learning to execute. arXiv preprint arXiv:1410.4615 ."},{"key":"1038_CR86","unstructured":"Zhou, B., Tian, Y., Sukhbaatar, S., Szlam, A., & Fergus, R. (2015). Simple baseline for visual question answering. arXiv:1512.02167 ."},{"key":"1038_CR87","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M., & Fei-Fei, L. (2016). Visual7W: Grounded question answering in images. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2016.540"},{"key":"1038_CR88","unstructured":"Zhu, L., Xu, Z., Yang, Y., & Hauptmann, A. G. (2015). Uncovering temporal context for video question and answering. arXiv:1511.04670 ."},{"key":"1038_CR89","doi-asserted-by":"crossref","unstructured":"Zitnick, C. L., Parikh, D., & Vanderwende, L. (2013). Learning the visual interpretation of sentences. 
In Proceedings of the IEEE International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2013.211"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-017-1038-2\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-017-1038-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-017-1038-2.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,1]],"date-time":"2022-08-01T22:25:27Z","timestamp":1659392727000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-017-1038-2"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,8,29]]},"references-count":89,"journal-issue":{"issue":"1-3","published-print":{"date-parts":[[2017,12]]}},"alternative-id":["1038"],"URL":"https:\/\/doi.org\/10.1007\/s11263-017-1038-2","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,8,29]]}}}