{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,21]],"date-time":"2026-05-21T17:08:19Z","timestamp":1779383299128,"version":"3.53.1"},"publisher-location":"Cham","reference-count":53,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319464473","type":"print"},{"value":"9783319464480","type":"electronic"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46448-0_49","type":"book-chapter","created":{"date-parts":[[2016,9,16]],"date-time":"2016-09-16T08:27:24Z","timestamp":1474014444000},"page":"817-834","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":211,"title":["Grounding of Textual Phrases in Images by Reconstruction"],"prefix":"10.1007","author":[{"given":"Anna","family":"Rohrbach","sequence":"first","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Marcus","family":"Rohrbach","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Ronghang","family":"Hu","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Trevor","family":"Darrell","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]},{"given":"Bernt","family":"Schiele","sequence":"additional","affiliation":[],"role":[{"vocabulary":"crossref","role":"author"}]}],"member":"297","published-online":{"date-parts":[[2016,9,17]]},"reference":[{"key":"49_CR1","unstructured":"Ammar, W., Dyer, C., Smith, N.A.: Conditional random field autoencoders for unsupervised structured prediction. In: Advances in Neural Information Processing Systems (NIPS) (2014)"},{"key":"49_CR2","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"49_CR3","unstructured":"Blaschko, M., Vedaldi, A., Zisserman, A.: Simultaneous object detection and ranking with weak supervision. In: Advances in Neural Information Processing Systems (NIPS), pp. 235\u2013243 (2010)"},{"key":"49_CR4","doi-asserted-by":"crossref","unstructured":"Chen, X., Gupta, A.: Webly supervised learning of convolutional networks. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.168"},{"key":"49_CR5","doi-asserted-by":"crossref","unstructured":"Chen, X., Zitnick, C.L.: Mind\u2019s eye: a recurrent visual representation for image caption generation. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"49_CR6","doi-asserted-by":"crossref","unstructured":"Cinbis, R.G., Verbeek, J., Schmid, C.: Multi-fold MIL training for weakly supervised object localization. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2014)","DOI":"10.1109\/CVPR.2014.309"},{"key":"49_CR7","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.J., Li, K., Fei-Fei, L.: Imagenet: a large-scale hierarchical image database. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"49_CR8","doi-asserted-by":"crossref","unstructured":"Divvala, S., Farhadi, A., Guestrin, C.: Learning everything about anything: Webly-supervised visual concept learning. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2014)","DOI":"10.1109\/CVPR.2014.412"},{"key":"49_CR9","doi-asserted-by":"crossref","unstructured":"Donahue, J., Hendricks, L.A., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.21236\/ADA623249"},{"issue":"2","key":"49_CR10","doi-asserted-by":"publisher","first-page":"303","DOI":"10.1007\/s11263-009-0275-4","volume":"88","author":"M Everingham","year":"2010","unstructured":"Everingham, M., Van Gool, L., Williams, C.K., Winn, J., Zisserman, A.: The Pascal Visual Object Classes (VOC) challenge. Int. J. Comput. Vis. (IJCV) 88(2), 303\u2013338 (2010)","journal-title":"Int. J. Comput. Vis. (IJCV)"},{"key":"49_CR11","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/7287.001.0001","volume-title":"WordNet: An Electronical Lexical Database","author":"C Fellbaum","year":"1998","unstructured":"Fellbaum, C.: WordNet: An Electronical Lexical Database. The MIT Press, Cambridge (1998)"},{"key":"49_CR12","doi-asserted-by":"crossref","unstructured":"Girshick, R.: Fast R-CNN. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.169"},{"key":"49_CR13","unstructured":"Glorot, X., Bengio, Y.: Understanding the difficulty of training deep feedforward neural networks. In: International Conference on Artificial Intelligence and Statistics, pp. 249\u2013256 (2010)"},{"key":"49_CR14","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"529","DOI":"10.1007\/978-3-319-10593-2_35","volume-title":"Computer Vision \u2013 ECCV 2014","author":"Y Gong","year":"2014","unstructured":"Gong, Y., Wang, L., Hodosh, M., Hockenmaier, J., Lazebnik, S.: Improving image-sentence embeddings using large weakly annotated photo collections. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part IV. LNCS, vol. 8692, pp. 529\u2013545. Springer, Switzerland (2014)"},{"key":"49_CR15","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Rodner, E., Saenko, K., Zhang, N., Farrell, R., Donahue, J., Darrell, T.: Open-vocabulary object retrieval. In: Robotics: Science and Systems (2014)","DOI":"10.15607\/RSS.2014.X.041"},{"key":"49_CR16","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Delving deep into rectifiers: surpassing human-level performance on imagenet classification. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/ICCV.2015.123"},{"issue":"8","key":"49_CR17","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"49_CR18","doi-asserted-by":"crossref","unstructured":"Hu, R., Xu, H., Rohrbach, M., Feng, J., Saenko, K., Darrell, T.: Natural language object retrieval. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.493"},{"key":"49_CR19","unstructured":"Ioffe, S., Szegedy, C.: Batch normalization: accelerating deep network training by reducing internal covariate shift. \n                      arXiv:1502.03167\n                      \n                     (2015)"},{"key":"49_CR20","doi-asserted-by":"crossref","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Long, J., Girshick, R., Guadarrama, S., Darrell, T.: Caffe: convolutional architecture for fast feature embedding. In: Proceedings of the ACM International Conference on Multimedia, pp. 675\u2013678. ACM (2014)","DOI":"10.1145\/2647868.2654889"},{"key":"49_CR21","unstructured":"Jin, J., Fu, K., Cui, R., Sha, F., Zhang, C.: Aligning where to see and what to tell: image caption with region-based attention and scene factorization. \n                      arXiv:1506.06272\n                      \n                     (2015)"},{"key":"49_CR22","doi-asserted-by":"crossref","unstructured":"Johnson, J., Krishna, R., Stark, M., Li, L.J., Shamma, D., Bernstein, M., Fei-Fei, L.: Image retrieval using scene graphs. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3668\u20133678 (2015)","DOI":"10.1109\/CVPR.2015.7298990"},{"key":"49_CR23","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"253","DOI":"10.1007\/978-3-319-10599-4_17","volume-title":"Computer Vision \u2013 ECCV 2014","author":"A Joulin","year":"2014","unstructured":"Joulin, A., Tang, K., Fei-Fei, L.: Efficient image and video co-localization with Frank-Wolfe algorithm. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part VI. LNCS, vol. 8694, pp. 253\u2013268. Springer, Heidelberg (2014)"},{"key":"49_CR24","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"49_CR25","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.: Deep fragment embeddings for bidirectional image sentence mapping. In: Advances in Neural Information Processing Systems (NIPS) (2014)"},{"key":"49_CR26","doi-asserted-by":"crossref","unstructured":"Kazemzadeh, S., Ordonez, V., Matten, M., Berg, T.L.: Referit game: referring to objects in photographs of natural scenes. In: Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP) (2014)","DOI":"10.3115\/v1\/D14-1086"},{"key":"49_CR27","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. \n                      arXiv:1412.6980\n                      \n                     (2014)"},{"key":"49_CR28","doi-asserted-by":"crossref","unstructured":"Kong, C., Lin, D., Bansal, M., Urtasun, R., Fidler, S.: What are you talking about? Text-to-image coreference. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 3558\u20133565. IEEE (2014)","DOI":"10.1109\/CVPR.2014.455"},{"key":"49_CR29","doi-asserted-by":"crossref","first-page":"193","DOI":"10.1162\/tacl_a_00220","volume":"1","author":"J Krishnamurthy","year":"2013","unstructured":"Krishnamurthy, J., Kollar, T.: Jointly learning to parse and perceive: connecting natural language to the physical world. Trans. Assoc. Comput. Linguist. (TACL) 1, 193\u2013206 (2013)","journal-title":"Trans. Assoc. Comput. Linguist. (TACL)"},{"key":"49_CR30","doi-asserted-by":"crossref","unstructured":"Kwak, S., Cho, M., Laptev, I., Ponce, J., Schmid, C.: Unsupervised object discovery and tracking in video collections. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.363"},{"key":"49_CR31","doi-asserted-by":"crossref","unstructured":"Lin, D., Fidler, S., Kong, C., Urtasun, R.: Visual semantic search: retrieving videos via complex textual queries. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 2657\u20132664. IEEE (2014)","DOI":"10.1109\/CVPR.2014.340"},{"key":"49_CR32","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part V. LNCS, vol. 8693, pp. 740\u2013755. Springer, Switzerland (2014)"},{"key":"49_CR33","doi-asserted-by":"crossref","unstructured":"Mao, J., Huang, J., Toshev, A., Camburu, O., Yuille, A., Murphy, K.: Generation and comprehension of unambiguous object descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.9"},{"key":"49_CR34","unstructured":"Matuszek, C., Fitzgerald, N., Zettlemoyer, L., Bo, L., Fox, D.: A joint model of language and perception for grounded attribute learning. In: Proceedings of the International Conference on Machine Learning (ICML) (2012)"},{"key":"49_CR35","doi-asserted-by":"crossref","unstructured":"Plummer, B., Wang, L., Cervantes, C., Caicedo, J., Hockenmaier, J., Lazebnik, S.: Flickr30k entities: collecting region-to-phrase correspondences for richer image-to-sentence models. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.303"},{"key":"49_CR36","doi-asserted-by":"crossref","unstructured":"Sadeghi, F., Divvala, S.K., Farhadi, A.: Viske: visual knowledge extraction and question answering by visual verification of relation phrases. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298752"},{"key":"49_CR37","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"49_CR38","unstructured":"Song, H.O., Girshick, R., Jegelka, S., Mairal, J., Harchaoui, Z., Darrell, T.: On learning to localize objects with minimal supervision. \n                      arXiv:1403.1024\n                      \n                     (2014)"},{"key":"49_CR39","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: Advances in Neural Information Processing Systems (NIPS), pp. 3104\u20133112 (2014)"},{"key":"49_CR40","doi-asserted-by":"crossref","unstructured":"Tang, K., Joulin, A., Li, L.J., Fei-Fei, L.: Co-localization in real-world images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR). IEEE (2014)","DOI":"10.1109\/CVPR.2014.190"},{"key":"49_CR41","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., B\u00e4uml, M., Stiefelhagen, R.: Book2movie: aligning video scenes with book chapters. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp. 1827\u20131835 (2015)","DOI":"10.1109\/CVPR.2015.7298792"},{"issue":"2","key":"49_CR42","doi-asserted-by":"publisher","first-page":"154","DOI":"10.1007\/s11263-013-0620-5","volume":"104","author":"JR Uijlings","year":"2013","unstructured":"Uijlings, J.R., van de Sande, K.E., Gevers, T., Smeulders, A.W.: Selective search for object recognition. Int. J. Comput. Vis. (IJCV) 104(2), 154\u2013171 (2013)","journal-title":"Int. J. Comput. Vis. (IJCV)"},{"key":"49_CR43","doi-asserted-by":"crossref","unstructured":"Vincent, P., Larochelle, H., Bengio, Y., Manzagol, P.A.: Extracting and composing robust features with denoising autoencoders. In: Proceedings of the International Conference on Machine Learning (ICML) (2008)","DOI":"10.1145\/1390156.1390294"},{"key":"49_CR44","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"49_CR45","doi-asserted-by":"crossref","unstructured":"Wang, L., Li, Y., Lazebnik, S.: Learning deep structure-preserving image-text embeddings. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) (2016)","DOI":"10.1109\/CVPR.2016.541"},{"key":"49_CR46","unstructured":"Xu, K., Ba, J., Kiros, R., Courville, A., Salakhutdinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: Proceedings of the International Conference on Machine Learning (ICML) (2015)"},{"key":"49_CR47","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"49_CR48","unstructured":"Yeung, S., Russakovsky, O., Jin, N., Andriluka, M., Mori, G., Fei-Fei, L.: Every moment counts: dense detailed labeling of actions in complex videos. \n                      arXiv:1507.05738\n                      \n                     (2015)"},{"key":"49_CR49","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014)","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"49_CR50","unstructured":"Yu, H., Siskind, J.M.: Grounded language learning from video described with sentences. In: Proceedings of the Annual Meeting of the Association for Computational Linguistics (ACL), pp. 53\u201363 (2013)"},{"key":"49_CR51","unstructured":"Yu, H., Siskind, J.M.: Sentence directed video object codetection. \n                      arXiv:1506.02059\n                      \n                     (2015)"},{"key":"49_CR52","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., Zemel, R., Salakhutdinov, R., Urtasun, R., Torralba, A., Fidler, S.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: Proceedings of the IEEE International Conference on Computer Vision (ICCV) (2015)","DOI":"10.1109\/ICCV.2015.11"},{"key":"49_CR53","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"crossref","first-page":"391","DOI":"10.1007\/978-3-319-10602-1_26","volume-title":"Computer Vision \u2013 ECCV 2014","author":"CL Zitnick","year":"2014","unstructured":"Zitnick, C.L., Doll\u00e1r, P.: Edge boxes: locating object proposals from edges. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014, Part V. LNCS, vol. 8693, pp. 391\u2013405. Springer, Switzerland (2014)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2016"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46448-0_49","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2020,10,10]],"date-time":"2020-10-10T00:18:07Z","timestamp":1602289087000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-46448-0_49"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319464473","9783319464480"],"references-count":53,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46448-0_49","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"17 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.eccv2016.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}