{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,10,8]],"date-time":"2025-10-08T15:45:00Z","timestamp":1759938300224,"version":"3.40.3"},"publisher-location":"Cham","reference-count":63,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783319464749"},{"type":"electronic","value":"9783319464756"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46475-6_17","type":"book-chapter","created":{"date-parts":[[2016,9,16]],"date-time":"2016-09-16T08:48:10Z","timestamp":1474015690000},"page":"261-277","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":39,"title":["Leveraging Visual Question Answering for Image-Caption Ranking"],"prefix":"10.1007","author":[{"given":"Xiao","family":"Lin","sequence":"first","affiliation":[]},{"given":"Devi","family":"Parikh","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,17]]},"reference":[{"key":"17_CR1","doi-asserted-by":"crossref","unstructured":"Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L., Parikh, D.: VQA: visual question answering. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.279"},{"key":"17_CR2","doi-asserted-by":"crossref","unstructured":"Antol, S., Zitnick, C.L., Parikh, D.: Zero-shot learning via visual abstraction. In: ECCV (2014)","DOI":"10.1007\/978-3-319-10593-2_27"},{"key":"17_CR3","doi-asserted-by":"crossref","unstructured":"Berg, T., Belhumeur, P.N.: POOF: part-based one-vs.-one features for fine-grained categorization, face verification, and attribute estimation. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.128"},{"key":"17_CR4","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"168","DOI":"10.1007\/978-3-642-15567-3_13","volume-title":"Computer Vision \u2013 ECCV 2010","author":"L Bourdev","year":"2010","unstructured":"Bourdev, L., Maji, S., Brox, T., Malik, J.: Detecting people using mutually consistent poselet activations. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part VI. LNCS, vol. 6316, pp. 168\u2013181. Springer, Heidelberg (2010)"},{"key":"17_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"438","DOI":"10.1007\/978-3-642-15561-1_32","volume-title":"Computer Vision \u2013 ECCV 2010","author":"S Branson","year":"2010","unstructured":"Branson, S., Wah, C., Schroff, F., Babenko, B., Welinder, P., Perona, P., Belongie, S.: Visual recognition with humans in the loop. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part IV. LNCS, vol. 6314, pp. 438\u2013451. Springer, Heidelberg (2010)"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Chen, X., Lawrence Zitnick, C.: Mind\u2019s eye: a recurrent visual representation for image caption generation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"17_CR7","unstructured":"Cho, K., van Merri\u00ebnboer, B., Bahdanau, D., Bengio, Y.: On the properties of neural machine translation: encoder-decoder approaches (2014). arXiv preprint \n                      arXiv:1409.1259"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: ImageNet: a large-scale hierarchical image database. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"17_CR9","doi-asserted-by":"crossref","unstructured":"Doersch, C., Gupta, A., Efros, A.A.: Unsupervised visual representation learning by context prediction. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.167"},{"key":"17_CR10","doi-asserted-by":"crossref","unstructured":"Donahue, J., Grauman, K.: Annotator rationales for visual recognition. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126394"},{"key":"17_CR11","unstructured":"Donahue, J., Jia, Y., Vinyals, O., Hoffman, J., Zhang, N., Tzeng, E., Darrell, T.: DeCAF: a deep convolutional activation feature for generic visual recognition (2013). arXiv preprint \n                      arXiv:1310.1531"},{"key":"17_CR12","doi-asserted-by":"crossref","unstructured":"Elhoseiny, M., Saleh, B., Elgammal, A.: Write a classifier: zero-shot learning using purely textual descriptions. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.321"},{"key":"17_CR13","doi-asserted-by":"crossref","unstructured":"Elliott, D., Keller, F.: Comparing automatic evaluation measures for image description. In: Proceedings of 52nd Annual Meeting of the Association for Computational Linguistics, pp. 452\u2013457 (2014)","DOI":"10.3115\/v1\/P14-2074"},{"key":"17_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/978-3-642-15561-1_2","volume-title":"Computer Vision \u2013 ECCV 2010","author":"A Farhadi","year":"2010","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: generating sentences from images. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part IV. LNCS, vol. 6314, pp. 15\u201329. Springer, Heidelberg (2010)"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Endres, I., Hoiem, D., Forsyth, D.: Describing objects by their attributes. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206772"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Fouhey, D.F., Zitnick, C.L.: Predicting object dynamics in scenes. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.260"},{"key":"17_CR17","unstructured":"Gao, H., Mao, J., Zhou, J., Huang, Z., Wang, L., Xu, W.: Are you talking to a machine? Dataset and methods for multilingual image question answering. In: NIPS (2015)"},{"key":"17_CR18","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"16","DOI":"10.1007\/978-3-540-88682-2_3","volume-title":"Computer Vision \u2013 ECCV 2008","author":"A Gupta","year":"2008","unstructured":"Gupta, A., Davis, L.S.: Beyond nouns: exploiting prepositions and comparative adjectives for learning visual classifiers. In: Forsyth, D., Torr, P., Zisserman, A. (eds.) ECCV 2008, Part I. LNCS, vol. 5302, pp. 16\u201329. Springer, Heidelberg (2008)"},{"key":"17_CR19","unstructured":"Hamrick, J., Battaglia, P., Tenenbaum, J.B.: Internal physics models guide probabilistic judgments about object dynamics. In: Proceedings of 33rd Annual Meeting of the Cognitive Science Society, Boston, MA (2011)"},{"issue":"8","key":"17_CR20","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"17_CR21","doi-asserted-by":"crossref","unstructured":"Johnson, J., Krishna, R., Stark, M., Li, L.J., Shamma, D., Bernstein, M., Fei-Fei, L.: Image retrieval using scene graphs. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"17_CR22","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"17_CR23","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.: Unifying visual-semantic embeddings with multimodal neural language models. In: TACL (2015)"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Klein, B., Lev, G., Sadeh, G., Wolf, L.: Associating neural word embeddings with deep image representations using Fisher vectors. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Kovashka, A., Parikh, D., Grauman, K.: WhittleSearch: image search with relative attribute feedback. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6248026"},{"key":"17_CR26","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: NIPS (2012)"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., Premraj, V., Dhar, S., Li, S., Choi, Y., Berg, A.C., Berg, T.L.: Baby talk: understanding and generating simple image descriptions. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Kumar, N., Berg, A.C., Belhumeur, P.N., Nayar, S.K.: Describable visual attributes for face verification and image search. In: IEEE TPAMI (2011)","DOI":"10.1109\/TPAMI.2011.48"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Lampert, C.H., Nickisch, H., Harmeling, S.: Learning to detect unseen object classes by between-class attribute transfer. In: CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206594"},{"key":"17_CR30","unstructured":"Li, L.J., Su, H., Fei-Fei, L., Xing, E.P.: Object bank: a high-level image representation for scene classification and semantic feature sparsification. In: NIPS (2010)"},{"key":"17_CR31","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part V. LNCS, vol. 8693, pp. 740\u2013755. Springer, Heidelberg (2014)"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Lin, X., Parikh, D.: Don\u2019t just listen, use your imagination: leveraging visual common sense for non-visual tasks. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298917"},{"key":"17_CR33","unstructured":"Ma, L., Lu, Z., Li, H.: Learning to answer questions from image using convolutional neural network (2015). arXiv preprint \n                      arXiv:1506.00333"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Ma, L., Lu, Z., Shang, L., Li, H.: Multimodal convolutional neural networks for matching image and sentence. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.301"},{"key":"17_CR35","unstructured":"Malinowski, M., Fritz, M.: A multi-world approach to question answering about real-world scenes based on uncertain input. In: NIPS (2014)"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Malinowski, M., Rohrbach, M., Fritz, M.: Ask your neurons: a neural-based approach to answering questions about images. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.9"},{"key":"17_CR37","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Huang, Z., Yuille, A.: Deep captioning with multimodal recurrent neural networks (M-RNN). In: ICLR (2015)"},{"key":"17_CR38","doi-asserted-by":"crossref","unstructured":"Parikh, D., Grauman, K.: Relative attributes. In: ICCV (2011)","DOI":"10.1109\/ICCV.2011.6126281"},{"key":"17_CR39","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"354","DOI":"10.1007\/978-3-642-33712-3_26","volume-title":"Computer Vision \u2013 ECCV 2012","author":"A Parkash","year":"2012","unstructured":"Parkash, A., Parikh, D.: Attributes for Classifier Feedback. In: Fitzgibbon, A., Lazebnik, S., Perona, P., Sato, Y., Schmid, C. (eds.) ECCV 2012, Part III. LNCS, vol. 7574, pp. 354\u2013368. Springer, Heidelberg (2012)"},{"key":"17_CR40","unstructured":"Pirsiavash, H., Vondrick, C., Torralba, A.: Inferring the why in images. CoRR abs\/1406.5472 (2014). \n                      http:\/\/arXiv.org\/abs\/1406.5472"},{"key":"17_CR41","unstructured":"Ray, A., Christie, G., Bansal, M., Batra, D., Parikh, D.: Question relevance in VQA: identifying non-visual and false-premise questions (2016). arXiv preprint \n                      arXiv:1606.06622"},{"key":"17_CR42","unstructured":"Ren, M., Kiros, R., Zemel, R.: Exploring models and data for image question answering. In: NIPS (2015)"},{"key":"17_CR43","doi-asserted-by":"crossref","unstructured":"Sadanand, S., Corso, J.J.: Action bank: a high-level representation of activity in video. In: CVPR (2012)","DOI":"10.1109\/CVPR.2012.6247806"},{"key":"17_CR44","doi-asserted-by":"crossref","unstructured":"Sadeghi, F., Divvala, S.K., Farhadi, A.: VisKE: visual knowledge extraction and question answering by visual verification of relation phrases. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298752"},{"key":"17_CR45","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. CoRR abs\/1409.1556 (2014)"},{"key":"17_CR46","unstructured":"Socher, R., Ganjoo, M., Manning, C.D., Ng, A.: Zero-shot learning through cross-modal transfer. In: NIPS (2013)"},{"key":"17_CR47","first-page":"1929","volume":"15","author":"N Srivastava","year":"2014","unstructured":"Srivastava, N., Hinton, G., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.: Dropout: a simple way to prevent neural networks from overfitting. JMLR 15, 1929\u20131958 (2014)","journal-title":"JMLR"},{"key":"17_CR48","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: NIPS (2014)"},{"key":"17_CR49","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A.: Going deeper with convolutions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"17_CR50","doi-asserted-by":"crossref","unstructured":"Tang, K., Paluri, M., Fei-fei, L., Fergus, R., Bourdev, L.: Improving image classification with location context. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.121"},{"key":"17_CR51","doi-asserted-by":"crossref","unstructured":"Vedantum, R., Lin, X., Batra, T., Zitnick, C.L., Parikh, D.: Learning common sense through visual abstraction. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.292"},{"key":"17_CR52","doi-asserted-by":"crossref","unstructured":"Vedantum, R., Zitnick, C.L., Parikh, D.: Cider: Consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"17_CR53","unstructured":"Vondrick, C., Pirsiavash, H., Torralba, A.: Anticipating the future by watching unlabeled video (2015). arXiv preprint \n                      arXiv:1504.08023"},{"key":"17_CR54","doi-asserted-by":"crossref","unstructured":"Walker, J., Gupta, A., Hebert, M.: Patch to the future: unsupervised visual prediction. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.416"},{"key":"17_CR55","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"155","DOI":"10.1007\/978-3-642-15555-0_12","volume-title":"Computer Vision \u2013 ECCV 2010","author":"Y Wang","year":"2010","unstructured":"Wang, Y., Mori, G.: A discriminative latent model of object classes and attributes. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part V. LNCS, vol. 6315, pp. 155\u2013168. Springer, Heidelberg (2010)"},{"key":"17_CR56","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhutdinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"},{"key":"17_CR57","unstructured":"Yu, L., Park, E., Berg, A.C., Berg, T.L.: Visual madlibs: fill in the blank image generation and question answering (2015). arXiv preprint \n                      arXiv:1506.00278"},{"key":"17_CR58","doi-asserted-by":"crossref","unstructured":"Zhang, N., Farrell, R., Iandola, F., Darrell, T.: Deformable part descriptors for fine-grained recognition and attribute prediction. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.96"},{"key":"17_CR59","doi-asserted-by":"crossref","unstructured":"Zhang, N., Paluri, M., Ranzato, M., Darrell, T., Bourdev, L.: PANDA: pose aligned networks for deep attribute modeling. In: CVPR (2014)","DOI":"10.1109\/CVPR.2014.212"},{"key":"17_CR60","doi-asserted-by":"crossref","unstructured":"Zheng, B., Zhao, Y., Yu, J., Ikeuchi, K., Zhu, S.C.: Beyond point clouds: scene understanding by reasoning geometry and physics. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.402"},{"key":"17_CR61","unstructured":"Zhou, B., Tian, Y., Sukhbaatar, S., Szlam, A., Fergus, R.: Simple baseline for visual question answering (2015). arXiv preprint \n                      arXiv:1512.02167"},{"key":"17_CR62","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"408","DOI":"10.1007\/978-3-319-10605-2_27","volume-title":"Computer Vision \u2013 ECCV 2014","author":"Y Zhu","year":"2014","unstructured":"Zhu, Y., Fathi, A., Fei-Fei, L.: Reasoning about object affordances in a knowledge base representation. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part II. LNCS, vol. 8690, pp. 408\u2013424. Springer, Heidelberg (2014)"},{"key":"17_CR63","unstructured":"Zhu, Y., Zhang, C., Re, C., Fei-Fei, L.: Building a large-scale multimodal knowledge base for visual question answering (2013). arXiv preprint \n                      arXiv:1310.1531"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2016"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46475-6_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,10,10]],"date-time":"2020-10-10T01:02:39Z","timestamp":1602291759000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-46475-6_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319464749","9783319464756"],"references-count":63,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46475-6_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"17 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.eccv2016.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}