{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,1,22]],"date-time":"2023-01-22T01:29:37Z","timestamp":1674350977215},"reference-count":23,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2017,3,31]],"date-time":"2017-03-31T00:00:00Z","timestamp":1490918400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2017,5]]},"DOI":"10.1007\/s11263-017-0993-y","type":"journal-article","created":{"date-parts":[[2017,3,31]],"date-time":"2017-03-31T12:20:03Z","timestamp":1490962803000},"page":"1-3","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Guest Editorial: Image and Language Understanding"],"prefix":"10.1007","volume":"123","author":[{"given":"Margaret","family":"Mitchell","sequence":"first","affiliation":[]},{"given":"John C.","family":"Platt","sequence":"additional","affiliation":[]},{"given":"Kate","family":"Saenko","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,3,31]]},"reference":[{"key":"993_CR1","doi-asserted-by":"publisher","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C. L., & Parikh, D. (2015). VQA: Visual question answering. In Proceedings of the IEEE international conference on computer vision (pp. 2425\u20132433).","DOI":"10.1109\/ICCV.2015.279"},{"key":"993_CR2","unstructured":"Devlin, J., Cheng, H., Fang, H., Gupta, S., Deng, L., He, X., Zweig, G., & Mitchell, M. (2015). Language models for image captioning: The quirks and what works. In Proceedings of the 53rd annual meeting of the association for computational linguistics and the 7th international joint conference on natural language processing (Vol. 2: short papers, pp. 100\u2013105). Association for Computational Linguistics, Beijing, China. http:\/\/www.aclweb.org\/anthology\/P15-2017 ."},{"key":"993_CR3","doi-asserted-by":"publisher","unstructured":"Donahue, J., Anne\u00a0Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., & Darrell, T. (2015). Long-term recurrent convolutional networks for visual recognition and description. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 2625\u20132634).","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"993_CR4","doi-asserted-by":"publisher","unstructured":"Fang, H., Gupta, S., Iandola, F., Srivastava, R. K., Deng, L., Doll\u00e1r, P., Gao, J., He, X., Mitchell, M., & Platt, J. C., et al. (2015). From captions to visual concepts and back. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 1473\u20131482).","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"993_CR5","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Venugopalan, S., Mooney, R., Darrell, T., & Saenko, K. (2013). Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In 2013 IEEE international conference on computer vision (ICCV) (pp. 2712\u20132719). IEEE.","DOI":"10.1109\/ICCV.2013.337"},{"key":"993_CR6","doi-asserted-by":"publisher","unstructured":"Jabri, A., Joulin, A., & van\u00a0der Maaten, L. (2016). Revisiting visual question answering baselines. In European conference on computer vision (pp. 727\u2013739). Berlin: Springer.","DOI":"10.1007\/978-3-319-46484-8_44"},{"key":"993_CR7","doi-asserted-by":"publisher","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3128\u20133137).","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"993_CR8","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. In Advances in neural information processing systems (pp. 1097\u20131105)."},{"issue":"12","key":"993_CR9","doi-asserted-by":"publisher","first-page":"2891","DOI":"10.1109\/TPAMI.2012.162","volume":"35","author":"G Kulkarni","year":"2013","unstructured":"Kulkarni, G., Premraj, V., Ordonez, V., Dhar, S., Li, S., Choi, Y., et al. (2013). Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence, 35(12), 2891\u20132903.","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"issue":"1","key":"993_CR10","doi-asserted-by":"publisher","first-page":"98","DOI":"10.1109\/72.554195","volume":"8","author":"S Lawrence","year":"1997","unstructured":"Lawrence, S., Giles, C. L., Tsoi, A. C., & Back, A. D. (1997). Face recognition: A convolutional neural-network approach. IEEE Transactions on Neural Networks, 8(1), 98\u2013113.","journal-title":"IEEE Transactions on Neural Networks"},{"issue":"4","key":"993_CR11","doi-asserted-by":"publisher","first-page":"541","DOI":"10.1162\/neco.1989.1.4.541","volume":"1","author":"Y LeCun","year":"1989","unstructured":"LeCun, Y., Boser, B., Denker, J. S., Henderson, D., Howard, R. E., Hubbard, W., et al. (1989). Backpropagation applied to handwritten zip code recognition. Neural Computation, 1(4), 541\u2013551.","journal-title":"Neural Computation"},{"key":"993_CR12","unstructured":"Lin, T. Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., & Zitnick, C. L. (2014). Microsoft COCO: Common objects in context. In European conference on computer vision (pp. 740\u2013755). Berlin: Springer."},{"key":"993_CR13","unstructured":"Malinowski, M., & Fritz, M. (2014). A multi-world approach to question answering about real-world scenes based on uncertain input. In Advances in neural information processing systems (pp. 1682\u20131690)."},{"key":"993_CR14","unstructured":"Malisiewicz, T., & Efros, A. (2009). Beyond categories: The visual memex model for reasoning about object relationships. In Y.\u00a0Bengio, D.\u00a0Schuurmans, J. D. Lafferty, C. K. I. Williams, A.\u00a0Culotta (Eds.), Advances in neural information processing systems (Vol. 22, pp. 1222\u20131230)."},{"key":"993_CR15","unstructured":"Mikolov, T., Karafi\u00e1t, M., Burget, L., Cernock\u1ef3, J., & Khudanpur, S. (2010). Recurrent neural network based language model. In Interspeech (Vol. 2, pp. 1045\u20131048). Makuhari, Chiba: ISCA."},{"key":"993_CR16","unstructured":"Mitchell, M., Han, X., Dodge, J., Mensch, A., Goyal, A., Berg, A., Yamaguchi, K., Berg, T., Stratos, K., & Daum\u00e9\u00a0III, H. (2012). Midge: Generating image descriptions from computer vision detections. In Proceedings of the 13th conference of the European chapter of the association for computational linguistics (pp. 747\u2013756). Association for Computational Linguistics."},{"key":"993_CR17","unstructured":"Nowlan, S. J., & Platt, J. C. (1995). A convolutional neural network hand tracker. In Advances in neural information processing systems (pp. 901\u2013908)."},{"key":"993_CR18","doi-asserted-by":"publisher","unstructured":"Parikh, D., Zitnick, C. L., & Chen, T. (2008). From appearance to context-based recognition: Dense labeling in small images. In IEEE conference on computer vision and pattern recognition, 2008. CVPR 2008 (pp. 1\u20138). IEEE.","DOI":"10.1109\/CVPR.2008.4587595"},{"key":"993_CR19","unstructured":"Ren, M., Kiros, R., & Zemel, R. (2015). Exploring models and data for image question answering. In Advances in neural information processing systems (pp. 2953\u20132961)."},{"key":"993_CR20","unstructured":"Roberts, L. G. (1963). Machine perception of three-dimensional solids. Ph.D. thesis, MIT."},{"key":"993_CR21","doi-asserted-by":"crossref","unstructured":"Sadeghi, M. A., & Farhadi, A. (2011). Recognition using visual phrases. In 2011 IEEE conference on computer vision and pattern recognition (CVPR) (pp. 1745\u20131752). IEEE.","DOI":"10.1109\/CVPR.2011.5995711"},{"key":"993_CR22","doi-asserted-by":"publisher","unstructured":"Vinyals, O., Toshev, A., Bengio, S., & Erhan, D. (2015). Show and tell: A neural image caption generator. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 3156\u20133164).","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"993_CR23","doi-asserted-by":"crossref","unstructured":"Zitnick, C. L., Agrawal, A., Antol, S., Mitchell, M., Batra, D., & Parikh, D. (2016). Measuring machine intelligence through visual question answering. AI Magazine, 37(1).","DOI":"10.1609\/aimag.v37i1.2647"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-017-0993-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-017-0993-y\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-017-0993-y.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2019,9,20]],"date-time":"2019-09-20T09:12:00Z","timestamp":1568970720000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-017-0993-y"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,3,31]]},"references-count":23,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2017,5]]}},"alternative-id":["993"],"URL":"https:\/\/doi.org\/10.1007\/s11263-017-0993-y","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,3,31]]}}}