{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,3,17]],"date-time":"2026-03-17T18:26:59Z","timestamp":1773772019818,"version":"3.50.1"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319249469","type":"print"},{"value":"9783319249476","type":"electronic"}],"license":[{"start":{"date-parts":[[2015,1,1]],"date-time":"2015-01-01T00:00:00Z","timestamp":1420070400000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by-nc\/2.5"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2015]]},"DOI":"10.1007\/978-3-319-24947-6_17","type":"book-chapter","created":{"date-parts":[[2015,10,6]],"date-time":"2015-10-06T18:10:30Z","timestamp":1444155030000},"page":"209-221","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":70,"title":["The Long-Short Story of Movie Description"],"prefix":"10.1007","author":[{"given":"Anna","family":"Rohrbach","sequence":"first","affiliation":[]},{"given":"Marcus","family":"Rohrbach","sequence":"additional","affiliation":[]},{"given":"Bernt","family":"Schiele","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2015,11,3]]},"reference":[{"key":"17_CR1","unstructured":"Barbu, A., Bridge, A., Burchill, Z., Coroian, D., Dickinson, S., Fidler, S., Michaux, A., Mussman, S., Narayanaswamy, S., Salvi, D., Schmidt, L., Shangguan, J., Siskind, J.M., Waggoner, J., Wang, S., Wei, J., Yin, Y., Zhang, Z.: Video in sentences out. In: UAI (2012)"},{"key":"17_CR2","unstructured":"Chen, D., Dolan, W.: Collecting highly parallel data for paraphrase evaluation. In: ACL (2011)"},{"key":"17_CR3","unstructured":"Chen, X., Fang, H., Lin, T., Vedantam, R., Gupta, S., Dollr, P., Zitnick, C.L.: Microsoft coco captions: data collection and evaluation server (2015). arXiv:1504.00325"},{"key":"17_CR4","doi-asserted-by":"crossref","unstructured":"Das, P., Xu, C., Doell, R., Corso, J.: Thousand frames in just a few words: lingual description of videos through latent topics and sparse object stitching. In: CVPR (2013)","DOI":"10.1109\/CVPR.2013.340"},{"key":"17_CR5","doi-asserted-by":"crossref","unstructured":"Devlin, J., Cheng, H., Fang, H., Gupta, S., Deng, L., He, X., Zweig, G., Mitchell, M.: Language models for image captioning: the quirks and what works (2015). arXiv:1505.01809","DOI":"10.3115\/v1\/P15-2017"},{"key":"17_CR6","doi-asserted-by":"crossref","unstructured":"Donahue, J., Hendricks, L.A., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In: CVPR (2015)","DOI":"10.21236\/ADA623249"},{"key":"17_CR7","doi-asserted-by":"crossref","unstructured":"Elliott, D., Keller, F.: Image description using visual dependency representations. In: EMNLP, pp. 1292\u20131302 (2013)","DOI":"10.18653\/v1\/D13-1128"},{"key":"17_CR8","doi-asserted-by":"crossref","unstructured":"Fang, H., Gupta, S., Iandola, F.N., Srivastava, R., Deng, L., Doll\u00e1r, P., Gao, J., He, X., Mitchell, M., Platt, J.C., Zitnick, C.L., Zweig, G.: From captions to visual concepts and back. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"17_CR9","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"15","DOI":"10.1007\/978-3-642-15561-1_2","volume-title":"Computer Vision \u2013 ECCV 2010","author":"A Farhadi","year":"2010","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: generating sentences from images. In: Daniilidis, K., Maragos, P., Paragios, N. (eds.) ECCV 2010, Part IV. LNCS, vol. 6314, pp. 15\u201329. Springer, Heidelberg (2010)"},{"key":"17_CR10","doi-asserted-by":"publisher","DOI":"10.7551\/mitpress\/7287.001.0001","volume-title":"WordNet: An Electronic Lexical Database","author":"C Fellbaum","year":"1998","unstructured":"Fellbaum, C.: WordNet: An Electronic Lexical Database. The MIT Press, Cambridge (1998)"},{"key":"17_CR11","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Venugopalan, S., Mooney, R., Darrell, T., Saenko, K.: Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shoot recognition. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"17_CR12","unstructured":"Hinton, G.E., Srivastava, N., Krizhevsky, A., Sutskever, I., Salakhutdinov, R.R.: Improving neural networks by preventing co-adaptation of feature detectors (2012). arXiv:1207.0580"},{"issue":"8","key":"17_CR13","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"17_CR14","unstructured":"Hoffman, J., Guadarrama, S., Tzeng, E., Donahue, J., Girshick, R., Darrell, T., Saenko, K.: LSDA: large scale detection through adaptation. In: NIPS (2014)"},{"key":"17_CR15","doi-asserted-by":"crossref","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Long, J., Girshick, R., Guadarrama, S., Darrell, T.: Caffe: Convolutional architecture for fast feature embedding (2014). arXiv:1408.5093","DOI":"10.1145\/2647868.2654889"},{"key":"17_CR16","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"17_CR17","unstructured":"Kiros, R., Salakhutdinov, R., Zemel, R.S.: Unifying visual-semantic embeddings with multimodal neural language models. TACL (2015)"},{"issue":"2","key":"17_CR18","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima, A., Tamura, T., Fukunaga, K.: Natural language description of human activities from video images based on concept hierarchy of actions. IJCV 50(2), 171\u2013184 (2002)","journal-title":"IJCV"},{"key":"17_CR19","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., Premraj, V., Dhar, S., Li, S., Choi, Y., Berg, A.C., Berg, T.L.: Baby talk: understanding and generating simple image descriptions. In: CVPR (2011)","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"17_CR20","unstructured":"Kuznetsova, P., Ordonez, V., Berg, T.L., Hill, U.C., Choi, Y.: Treetalk: composition and compression of trees for image descriptions. In: TACL (2014)"},{"key":"17_CR21","unstructured":"Lavie, M.D.A.: Meteor universal: language specific translation evaluation for any target language. In: ACL 2014, p. 376 (2014)"},{"key":"17_CR22","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Huang, Z., Yuille, A.: Deep captioning with multimodal recurrent neural networks (m-RNN). In: ICLR (2015)"},{"key":"17_CR23","unstructured":"Mitchell, M., Dodge, J., Goyal, A., Yamaguchi, K., Stratos, K., Han, X., Mensch, A., Berg, A.C., Berg, T.L., Daume III, H.: Midge: generating image descriptions from computer vision detections. In: EACL (2012)"},{"key":"17_CR24","doi-asserted-by":"crossref","unstructured":"Pan, Y., Mei, T., Yao, T., Li, H., Rui, Y.: Jointly modeling embedding and translation to bridge video and language (2015). arXiv:1505.01861","DOI":"10.1109\/CVPR.2016.497"},{"key":"17_CR25","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: BLEU: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"17_CR26","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"184","DOI":"10.1007\/978-3-319-11752-2_15","volume-title":"Pattern Recognition","author":"A Rohrbach","year":"2014","unstructured":"Rohrbach, A., Rohrbach, M., Qiu, W., Friedrich, A., Pinkal, M., Schiele, B.: Coherent multi-sentence video description with variable level of detail. In: Jiang, X., Hornegger, J., Koch, R. (eds.) GCPR 2014. LNCS, vol. 8753, pp. 184\u2013195. Springer, Heidelberg (2014)"},{"key":"17_CR27","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Schiele, B.: The long-short story of movie description (2015). arXiv:1506.01698","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"17_CR28","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"17_CR29","doi-asserted-by":"crossref","unstructured":"Rohrbach, M., Qiu, W., Titov, I., Thater, S., Pinkal, M., Schiele, B.: Translating video content to natural language descriptions. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.61"},{"key":"17_CR30","unstructured":"Thomason, J., Venugopalan, S., Guadarrama, S., Saenko, K., Mooney, R.J.: Integrating language and vision to generate natural language descriptions of videos in the wild. In: COLING (2014)"},{"key":"17_CR31","unstructured":"Torabi, A., Pal, C., Larochelle, H., Courville, A.: Using descriptive video services to create a large data source for video annotation research (2015). arXiv:1503.01070v1"},{"key":"17_CR32","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: Cider: Consensus-based image description evaluation. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"17_CR33","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence - video to text (2015). arXiv:1505.00487","DOI":"10.1109\/ICCV.2015.515"},{"key":"17_CR34","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. In: NAACL (2015)","DOI":"10.3115\/v1\/N15-1173"},{"key":"17_CR35","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: A neural image caption generator. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"17_CR36","doi-asserted-by":"crossref","unstructured":"Wang, H., Schmid, C.: Action recognition with improved trajectories. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.441"},{"key":"17_CR37","doi-asserted-by":"crossref","unstructured":"Xu, R., Xiong, C., Chen, W., Corso, J.J.: Jointly modeling deep video and compositional text to bridge vision and language in a unified framework. In: AAAI (2015)","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"17_CR38","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure (2015). arXiv:1502.08029v4","DOI":"10.1109\/ICCV.2015.512"},{"key":"17_CR39","doi-asserted-by":"publisher","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., Hockenmaier, J.: From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. TACL 2, 67\u201378 (2014)","journal-title":"TACL"},{"key":"17_CR40","unstructured":"Zhou, B., Lapedriza, A., Xiao, J., Torralba, A., Oliva, A.: Learning Deep Features for Scene Recognition using Places Database. In: NIPS (2014)"}],"container-title":["Lecture Notes in Computer Science","Pattern Recognition"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-24947-6_17","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,5,30]],"date-time":"2025-05-30T22:50:51Z","timestamp":1748645451000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-319-24947-6_17"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2015]]},"ISBN":["9783319249469","9783319249476"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-24947-6_17","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2015]]},"assertion":[{"value":"3 November 2015","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}}]}}