{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,1,16]],"date-time":"2026-01-16T19:08:42Z","timestamp":1768590522342,"version":"3.49.0"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319466033","type":"print"},{"value":"9783319466040","type":"electronic"}],"license":[{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2016,1,1]],"date-time":"2016-01-01T00:00:00Z","timestamp":1451606400000},"content-version":"vor","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2016]]},"DOI":"10.1007\/978-3-319-46604-0_46","type":"book-chapter","created":{"date-parts":[[2016,9,17]],"date-time":"2016-09-17T03:31:55Z","timestamp":1474083115000},"page":"651-667","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":52,"title":["Learning Joint Representations of Videos and Sentences with Web Image Search"],"prefix":"10.1007","author":[{"given":"Mayu","family":"Otani","sequence":"first","affiliation":[]},{"given":"Yuta","family":"Nakashima","sequence":"additional","affiliation":[]},{"given":"Esa","family":"Rahtu","sequence":"additional","affiliation":[]},{"given":"Janne","family":"Heikkil\u00e4","sequence":"additional","affiliation":[]},{"given":"Naokazu","family":"Yokoya","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2016,9,18]]},"reference":[{"key":"46_CR1","unstructured":"Chen, D.L., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: ACL, pp. 190\u2013200 (2011)"},{"key":"46_CR2","unstructured":"Chen, X., Fang, H., Lin, T., Vedantam, R., Gupta, S., Dollr, P., Zitnick, C.L.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 , 7 pages (2015)"},{"key":"46_CR3","doi-asserted-by":"crossref","unstructured":"Chopra, S., Hadsell, R., LeCun, Y.: Learning a similarity metric discriminatively, with application to face verification. In: CVPR, pp. 539\u2013546 (2005)","DOI":"10.1109\/CVPR.2005.202"},{"issue":"2","key":"46_CR4","doi-asserted-by":"publisher","first-page":"5: 1","DOI":"10.1145\/1348246.1348248","volume":"40","author":"R Datta","year":"2008","unstructured":"Datta, R., Joshi, D., Li, J., Wang, J.Z.: Image retrieval: ideas, influences, and trends of the new age. ACM Comput. Surv. 40(2), 5: 1\u20135: 60 (2008)","journal-title":"ACM Comput. Surv."},{"key":"46_CR5","unstructured":"Donahue, J., Jia, Y., Vinyals, O., Hoffman, J., Zhang, N., Tzeng, E., Darrell, T.: DeCAF: a deep convolutional activation feature for generic visual recognition. In: ICML, pp. 647\u2013655 (2014)"},{"key":"46_CR6","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F., Srivastava, R.K., Deng, L., Dollar, P., Gao, J., He, X., Mitchell, M., Platt, J.C., Zitnick, C.L., Zweig, G.: From captions to visual concepts and back. In: CVPR, pp. 1473\u20131482 (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"46_CR7","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/978-3-642-15561-1_2","volume-title":"Computer Vision \u2013 ECCV 2010","author":"A Farhadi","year":"2010","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: generating sentences from images. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010. LNCS, vol. 6314, pp. 15\u201329. Springer, Heidelberg (2010). doi: 10.1007\/978-3-642-15561-1_2"},{"key":"46_CR8","doi-asserted-by":"crossref","unstructured":"Fergus, R., Fei-Fei, L., Perona, P., Zisserman, A.: Learning object categories from Google\u2019s image search. In: ICCV, pp. 1816\u20131823 (2005)","DOI":"10.1109\/ICCV.2005.142"},{"key":"46_CR9","unstructured":"Frome, A., Corrado, G.S., Shlens, J., Bengio, S., Dean, J., Ranzato, M.A., Mikolov, T.: DeViSE: a deep visual-semantic embedding model. In: NIPS, pp. 2121\u20132129 (2013)"},{"key":"46_CR10","doi-asserted-by":"crossref","unstructured":"Girshick, R., Donahue, J., Darrell, T., Berkeley, U.C., Malik, J.: Rich feature hierarchies for accurate object detection and semantic segmentation. In: CVPR, pp. 580\u2013587 (2014)","DOI":"10.1109\/CVPR.2014.81"},{"key":"46_CR11","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Venugopalan, S., Austin, U.T., Krishnamoorthy, N., Mooney, R., Malkarnenkar, G., Darrell, T., Berkeley, U.C.: YouTube2Text: recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: ICCV, pp. 2712\u20132719 (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"46_CR12","doi-asserted-by":"crossref","unstructured":"Gygli, M., Grabner, H., Van Gool, L.: Video summarization by learning submodular mixtures of objectives. In: CVPR, pp. 3090\u20133098 (2015)","DOI":"10.1109\/CVPR.2015.7298928"},{"key":"46_CR13","doi-asserted-by":"crossref","unstructured":"Johnson, J., Ballan, L., Fei-Fei, L.: Love thy neighbors: image annotation by exploiting image metadata. In: ICCV, pp. 4624\u20134632 (2015)","DOI":"10.1109\/ICCV.2015.525"},{"key":"46_CR14","unstructured":"Karpathy, A., Joulin, A., Fei-Fei, L.: Deep fragment embeddings for bidirectional image sentence mapping. In: NIPS, pp. 1889\u20131897 (2014)"},{"key":"46_CR15","doi-asserted-by":"crossref","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. In: ICLR, 11 pages (2015)","DOI":"10.1201\/b19256-4"},{"key":"46_CR16","unstructured":"Kiros, R., Zhu, Y., Salakhutdinov, R.R., Zemel, R., Urtasun, R., Torralba, A., Fidler, S.: Skip-thought vectors. In: NIPS, pp. 3276\u20133284 (2015)"},{"key":"46_CR17","unstructured":"Le, Q.V., Mikolov, T.: Distributed representations of sentences and documents. In: ICML, pp. 1188\u20131196 (2014)"},{"key":"46_CR18","doi-asserted-by":"crossref","unstructured":"Lin, D., Fidler, S., Kong, C., Urtasun, R.: Visual semantic search: retrieving videos via complex textual queries. In: CVPR, pp. 2657\u20132664 (2014)","DOI":"10.1109\/CVPR.2014.340"},{"key":"46_CR19","doi-asserted-by":"crossref","unstructured":"Lin, T.Y., Belongie, S., Hays, J.: Learning deep representations for ground-to-aerial geolocalization. In: CVPR, pp. 5007\u20135015 (2015)","DOI":"10.1109\/CVPR.2015.7299135"},{"issue":"6","key":"46_CR20","doi-asserted-by":"publisher","first-page":"797","DOI":"10.1109\/TSMCC.2011.2109710","volume":"41","author":"S Maybank","year":"2011","unstructured":"Maybank, S.: A survey on visual content-based video indexing and retrieval. IEEE Trans. Syst. Man Cybern. Part C (Appl. Rev.) 41(6), 797\u2013819 (2011)","journal-title":"IEEE Trans. Syst. Man Cybern. Part C (Appl. Rev.)"},{"key":"46_CR21","unstructured":"Ordonez, V., Kulkarni, G., Berg, T.: Im2Text: describing images using 1 million captioned photographs. In: NIPS, pp. 1143\u20131151 (2011)"},{"key":"46_CR22","unstructured":"Rashtchian, C., Young, P., Hodosh, M., Hockenmaier, J.: Collecting image annotations using Amazon\u2019s mechanical turk. In: NAACL-HLT, pp. 139\u2013147 (2010)"},{"key":"46_CR23","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: Towards real-time object detection with region proposal networks. In: NIPS, pp. 91\u201399 (2015)"},{"key":"46_CR24","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Qiu, W., Titov, I., Thater, S., Pinkal, M., Schiele, B.: Translating video content to natural language descriptions. In: ICCV, pp. 433\u2013440 (2013)","DOI":"10.1109\/ICCV.2013.61"},{"issue":"3","key":"46_CR25","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, L.: ImageNet large scale visual recognition challenge. Int. J. Comput. Vis. 115(3), 211\u2013252 (2015)","journal-title":"Int. J. Comput. Vis."},{"key":"46_CR26","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recoginition. In: ICLR, p. 14 (2015)"},{"key":"46_CR27","unstructured":"Socher, R., Ganjoo, M., Manning, C.D., Ng, A.Y.: Zero-shot learning through cross-modal transfer. In: NIPS, pp. 935\u2013943 (2013)"},{"key":"46_CR28","doi-asserted-by":"crossref","unstructured":"Song, Y., Vallmitjana, J., Stent, A., Jaimes, A.: TVSum: summarizing web videos using titles. In: CVPR, pp. 5179\u20135187 (2015)","DOI":"10.1109\/CVPR.2015.7299154"},{"key":"46_CR29","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A.: Going deeper with convolutions. In: CVPR, pp. 1\u20139 (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"46_CR30","unstructured":"Tokui, S., Oono, K., Hido, S., Clayton, J.: Chainer: a next-generation open source framework for deep learning. In: NIPS, 6 pages (2015)"},{"key":"46_CR31","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. In: NAACL-HLT, pp. 1494\u20131504 (2014)","DOI":"10.3115\/v1\/N15-1173"},{"key":"46_CR32","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: CVPR, pp. 3156\u20133164 (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"46_CR33","doi-asserted-by":"crossref","unstructured":"Wang, X., Gupta, A.: Unsupervised learning of visual representations using videos. In: ICCV, pp. 2794\u20132802 (2015)","DOI":"10.1109\/ICCV.2015.320"},{"key":"46_CR34","doi-asserted-by":"crossref","unstructured":"Xu, R., Xiong, C., Chen, W., Corso, J.: Jointly modeling deep video and compositional text to bridge vision and language in a unified framework. In: AAAI, pp. 2346\u20132352 (2015)","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"46_CR35","doi-asserted-by":"crossref","unstructured":"Yao, L., Ballas, N., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure. In: ICCV, pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"46_CR36","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: new similarity metrics for semantic inference over event descriptions. Trans. Assoc. Comput. Linguist. 2, 67\u201378 (2014). https:\/\/tacl2013.cs.columbia.edu\/ojs\/index.php\/tacl\/article\/view\/229","journal-title":"Trans. Assoc. Comput. Linguist."},{"key":"46_CR37","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., Zemel, R., Salakhutdinov, R., Urtasun, R., Torralba, A., Fidler, S.: Aligning books and movies: towards story-like visual explanations by watching movies and reading books. In: IEEE International Conference on Computer Vision (ICCV), pp. 19\u201327 (2015)","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2016 Workshops"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-46604-0_46","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,10]],"date-time":"2025-06-10T19:36:36Z","timestamp":1749584196000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-46604-0_46"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2016]]},"ISBN":["9783319466033","9783319466040"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-46604-0_46","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2016]]},"assertion":[{"value":"18 September 2016","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2016","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 October 2016","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"16 October 2016","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2016","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/www.eccv2016.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}