{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,22]],"date-time":"2026-04-22T17:49:24Z","timestamp":1776880164560,"version":"3.51.2"},"reference-count":106,"publisher":"Springer Science and Business Media LLC","issue":"1","license":[{"start":{"date-parts":[[2017,1,25]],"date-time":"2017-01-25T00:00:00Z","timestamp":1485302400000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/creativecommons.org\/licenses\/by\/4.0"}],"funder":[{"name":"German Academic Exchange Service (DAAD)","award":["FITweltweit"],"award-info":[{"award-number":["FITweltweit"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Int J Comput Vis"],"published-print":{"date-parts":[[2017,5]]},"DOI":"10.1007\/s11263-016-0987-1","type":"journal-article","created":{"date-parts":[[2017,1,25]],"date-time":"2017-01-25T07:38:51Z","timestamp":1485329931000},"page":"94-120","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":223,"title":["Movie Description"],"prefix":"10.1007","volume":"123","author":[{"given":"Anna","family":"Rohrbach","sequence":"first","affiliation":[]},{"given":"Atousa","family":"Torabi","sequence":"additional","affiliation":[]},{"given":"Marcus","family":"Rohrbach","sequence":"additional","affiliation":[]},{"given":"Niket","family":"Tandon","sequence":"additional","affiliation":[]},{"given":"Christopher","family":"Pal","sequence":"additional","affiliation":[]},{"given":"Hugo","family":"Larochelle","sequence":"additional","affiliation":[]},{"given":"Aaron","family":"Courville","sequence":"additional","affiliation":[]},{"given":"Bernt","family":"Schiele","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2017,1,25]]},"reference":[{"key":"987_CR1","doi-asserted-by":"publisher","unstructured":"Anderson, P., Fernando, B., Johnson, M., & Gould, S. (2016). Spice: Semantic propositional image caption evaluation. In European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-46454-1_24"},{"key":"987_CR2","unstructured":"Baker, C.\u00a0F., Fillmore, C.\u00a0J., & Lowe, J.\u00a0B. (1998). The berkeley framenet project. In Proceedings of the annual meeting of the association for computational linguistics (ACL)."},{"key":"987_CR3","unstructured":"Ballas, N., Yao, L., Pal, C., & Courville, A. (2016). Delving deeper into convolutional networks for learning video representations. In International conference on learning representations (ICLR)."},{"key":"987_CR4","unstructured":"Barbu, A., Bridge, A., Burchill, Z., Coroian, D., Dickinson, S., Fidler, S. et al. (2012). Video in sentences out. In Proceedings of the conference on Uncertainty in artificial intelligence (UAI)."},{"key":"987_CR5","doi-asserted-by":"publisher","unstructured":"Bojanowski, P., Bach, F., Laptev, I., Ponce, J., Schmid, C., & Sivic, J. (2013). Finding actors and actions in movies. In International conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2013.283"},{"key":"987_CR6","doi-asserted-by":"publisher","unstructured":"Bojanowski, P., Lajugie, R., Bach, F., Laptev, I., Ponce, J., Schmid, C., & Sivic, J. (2014). Weakly supervised action labeling in videos under ordering constraints. In European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-10602-1_41"},{"key":"987_CR7","doi-asserted-by":"publisher","unstructured":"Bruni, M., Uricchio, T., Seidenari, L., & Del\u00a0Bimbo, A. (2016). Do textual descriptions help action recognition? In Proceedings of the ACM on multimedia conference (MM), pp. 645\u2013649.","DOI":"10.1145\/2964284.2967301"},{"key":"987_CR8","unstructured":"Chen, D. & Dolan, W. (2011). Collecting highly parallel data for paraphrase evaluation. In Proceedings of the annual meeting of the association for computational linguistics (ACL)."},{"key":"987_CR9","doi-asserted-by":"publisher","unstructured":"Chen, X, & Zitnick, C.\u00a0L. (2015). Mind\u2019s eye: A recurrent visual representation for image caption generation. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298856"},{"key":"987_CR10","unstructured":"Chen, X., Fang, H., Lin, T.-Y., Vedantam, R., Gupta, S., Doll\u00e1r, P., & Zitnick, C.\u00a0L. (2015). Microsoft coco captions: Data collection and evaluation server. arXiv:1504.00325 ."},{"key":"987_CR11","doi-asserted-by":"publisher","unstructured":"Cour, T., Jordan, C., Miltsakaki, E., & Taskar, B. (2008). Movie\/script: Alignment and parsing of video and text transcription. In European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-540-88693-8_12"},{"key":"987_CR12","doi-asserted-by":"publisher","unstructured":"Cour, T., Sapp, B., Jordan, C., & Taskar, B. (2009). Learning from ambiguously labeled images. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2009.5206667"},{"key":"987_CR13","unstructured":"Das, D., Martins, A. F. T., & Smith, N. A. (2012). An exact dual decomposition algorithm for shallow semantic parsing with constraints. In Proceedings of the annual meeting of the association for computational linguistics (ACL)."},{"key":"987_CR14","doi-asserted-by":"publisher","unstructured":"Das, P., Xu, C., Doell, R., & Corso, J. (2013). Thousand frames in just a few words: Lingual description of videos through latent topics and sparse object stitching. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2013.340"},{"key":"987_CR15","doi-asserted-by":"publisher","unstructured":"de\u00a0Melo, G., & Tandon, N. (2016). Seeing is believing: The quest for multimodal knowledge. SIGWEB Newsletter, (Spring). doi: 10.1145\/2903513.2903517 .","DOI":"10.1145\/2903513.2903517"},{"key":"987_CR16","doi-asserted-by":"publisher","unstructured":"Del\u00a0Corro, L., & Gemulla, R. (2013). Clausie: Clause-based open information extraction. In Proceedings of the international world wide web conference (WWW).","DOI":"10.1145\/2488388.2488420"},{"key":"987_CR17","doi-asserted-by":"crossref","unstructured":"Deng, J., Dong, W., Socher, R., Li, L.-J., Li, K., & Fei-Fei, L. (2009). Imagenet: A large-scale hierarchical image database. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"987_CR18","doi-asserted-by":"publisher","unstructured":"Denkowski, M., & Lavie, A. (2014). Meteor universal: Language specific translation evaluation for any target language. In Proceedings of the ninth workshop on statistical machine translation.","DOI":"10.3115\/v1\/W14-3348"},{"key":"987_CR19","doi-asserted-by":"publisher","unstructured":"Devlin, J., Cheng, H., Fang, H., Gupta, S., Deng, L., He, X. et al. (2015). Language models for image captioning: The quirks and what works. In Proceedings of the annual meeting of the association for computational linguistics (ACL).","DOI":"10.3115\/v1\/P15-2017"},{"key":"987_CR20","doi-asserted-by":"publisher","unstructured":"Donahue, J., Hendricks, L.\u00a0A., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K. et al. (2015). Long-term recurrent convolutional networks for visual recognition and description. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"987_CR21","doi-asserted-by":"publisher","unstructured":"Duchenne, O., Laptev, I., Sivic, J., Bach, F., & Ponce, J. (2009). Automatic annotation of human actions in video. In International conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2009.5459279"},{"key":"987_CR22","doi-asserted-by":"crossref","unstructured":"Elliott, D., & Keller, F. (2013). Image description using visual dependency representations. In Proceedings of the conference on empirical methods in natural language processing (EMNLP), pp. 1292\u20131302.","DOI":"10.18653\/v1\/D13-1128"},{"key":"987_CR23","doi-asserted-by":"publisher","unstructured":"Everingham, M., Sivic, J., & Zisserman, A. (2006). \u201dhello! my name is... buffy\u201d\u2014Automatic naming of characters in tv video. In Proceedings of the british machine vision conference (BMVC).","DOI":"10.5244\/C.20.92"},{"key":"987_CR24","doi-asserted-by":"publisher","unstructured":"Fang, H., Gupta, S., Iandola, F.\u00a0N., Srivastava, R., Deng, L., Doll\u00e1r, P. et al. (2015). From captions to visual concepts and back. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"987_CR25","doi-asserted-by":"publisher","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J. et al. (2010). Every picture tells a story: Generating sentences from images. In European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"987_CR26","doi-asserted-by":"crossref","DOI":"10.7551\/mitpress\/7287.001.0001","volume-title":"WordNet: An electronic lexical database","author":"C Fellbaum","year":"1998","unstructured":"Fellbaum, C. (1998). WordNet: An electronic lexical database. Cambridge: The MIT Press."},{"key":"987_CR27","doi-asserted-by":"publisher","unstructured":"Gagnon, L., Chapdelaine, C., Byrns, D., Foucher, S., Heritier, M., & Gupta, V. (2010). A computer-vision-assisted system for videodescription scripting. In Proceedings of the IEEE conference on computer vision and pattern recognition workshops (CVPR workshops).","DOI":"10.1109\/CVPRW.2010.5543575"},{"key":"987_CR28","doi-asserted-by":"publisher","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Venugopalan, S., Mooney, R., Darrell, T. et al. (2013). Youtube2text: Recognizing and describing arbitrary activities using semantic hierarchies and zero-shoot recognition. In International conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2013.337"},{"key":"987_CR29","doi-asserted-by":"publisher","unstructured":"Hendricks, L.\u00a0A., Venugopalan, S., Rohrbach, M., Mooney, R., Saenko, K., & Darrell, T. (2016). Deep compositional captioning: Describing novel object categories without paired training data. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.8"},{"key":"987_CR30","unstructured":"Hinton, G.\u00a0E., Srivastava, N., Krizhevsky, A., Sutskever, I., & Salakhutdinov, R.\u00a0R. (2012). Improving neural networks by preventing co-adaptation of feature detectors. arXiv:1207.0580 ."},{"issue":"8","key":"987_CR31","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., & Schmidhuber, J. (1997). Long short-term memory. Neural Computation, 9(8), 1735\u20131780.","journal-title":"Neural Computation"},{"key":"987_CR32","unstructured":"Hoffman, J., Guadarrama, S., Tzeng, E., Donahue, J., Girshick, R., Darrell, T., & Saenko, K. (2014). LSDA: Large scale detection through adaptation. In Conference on neural information processing systems (NIPS)."},{"key":"987_CR33","doi-asserted-by":"publisher","unstructured":"Karpathy, A., & Fei-Fei, L. (2015). Deep visual-semantic alignments for generating image descriptions. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"987_CR34","unstructured":"Kipper, K., Korhonen, A., Ryant, N., & Palmer, M. (2006). Extending verbnet with novel verb classes. In Proceedings of the international conference on language resources and evaluation (LREC)."},{"key":"987_CR35","unstructured":"Kiros, R., Salakhutdinov, R., & Zemel, R. (2014). Multimodal neural language models. In International conference on machine learning (ICML)."},{"key":"987_CR36","first-page":"595","volume":"14","author":"R Kiros","year":"2015","unstructured":"Kiros, R., Salakhutdinov, R., & Zemel, R. S. (2015). Unifying visual-semantic embeddings with multimodal neural language models. Transactions of the Association for Computational Linguistics (TACL), 14, 595\u2013603.","journal-title":"Transactions of the Association for Computational Linguistics (TACL)"},{"key":"987_CR37","doi-asserted-by":"publisher","unstructured":"Klein, B., Lev, G., Sadeh, G., & Wolf, L. (2015). Associating neural word embeddings with deep image representations using fisher vectors. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7299073"},{"key":"987_CR38","doi-asserted-by":"publisher","unstructured":"Koehn, P., Hoang, H., Birch, A., Callison-Burch, C., Federico, M., Bertoldi, N. et al. (2007). Moses: Open source toolkit for statistical machine translation. In Proceedings of the annual meeting of the association for computational linguistics (ACL).","DOI":"10.3115\/1557769.1557821"},{"issue":"2","key":"987_CR39","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima, A., Tamura, T., & Fukunaga, K. (2002). Natural language description of human activities from video images based on concept hierarchy of actions. International Journal of Computer Vision (IJCV), 50(2), 171\u2013184.","journal-title":"International Journal of Computer Vision (IJCV)"},{"key":"987_CR40","unstructured":"Krizhevsky, A., Sutskever, I., & Hinton, G. E. (2012). Imagenet classification with deep convolutional neural networks. In Conference on neural information processing systems (NIPS)."},{"key":"987_CR41","doi-asserted-by":"publisher","unstructured":"Kulkarni, G., Premraj, V., Dhar, S., Li, S., Choi, Y., Berg, A.\u00a0C., & Berg, T.\u00a0L. (2011). Baby talk: Understanding and generating simple image descriptions. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"987_CR42","unstructured":"Kuznetsova, P., Ordonez, V., Berg, A.\u00a0C., Berg, T.\u00a0L., & Choi, Y. (2012). Collective generation of natural image descriptions. In Proceedings of the annual meeting of the association for computational linguistics (ACL)."},{"key":"987_CR43","doi-asserted-by":"crossref","unstructured":"Kuznetsova, P., Ordonez, V., Berg, T.\u00a0L., Hill, UNC\u00a0Chapel, & Choi, Y. (2014). Treetalk: Composition and compression of trees for image descriptions. In Proceedings of the Transactions of the association for computational linguistics (TACL).","DOI":"10.1162\/tacl_a_00188"},{"key":"987_CR44","unstructured":"Lakritz, J. & Salway, A. (2006). The semi-automatic generation of audio description from screenplays. Technical report, Department of Computing Technical Report, University of Surrey."},{"key":"987_CR45","doi-asserted-by":"publisher","unstructured":"Laptev, I., Marszalek, M., Schmid, C., & Rozenfeld, B. (2008). Learning realistic human actions from movies. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2008.4587756"},{"key":"987_CR46","unstructured":"Lev, G., Sadeh, G., Klein, B., & Wolf, L. (2015). RNN fisher vectors for action recognition and image annotation. In European conference on computer vision (ECCV)."},{"key":"987_CR47","doi-asserted-by":"publisher","unstructured":"Li, G., Ma, S., & Han, Y. (2015). Summarization-based video caption via deep neural networks. In Proceedings of the 23rd annual ACM conference on multimedia conference.","DOI":"10.1145\/2733373.2806314"},{"key":"987_CR48","unstructured":"Li, S., Kulkarni, G., Berg, T.\u00a0L., Berg, A.\u00a0C., & Choi, Y. (2011). Composing simple image descriptions using web-scale N-grams. In Proceedings of the fifteenth conference on computational natural language learning (CoNLL). Association for Computational Linguistics."},{"key":"987_CR49","doi-asserted-by":"publisher","unstructured":"Li, Y., Song, Y., Cao, L., Tetreault, J., Goldberg, L., Jaimes, A., & Luo, J. (2016). TGIF: A new dataset and benchmark on animated GIF description. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.502"},{"key":"987_CR50","doi-asserted-by":"publisher","unstructured":"Liang, C., Xu, C., Cheng, J., & Lu, H. (2011). Tvparser: An automatic tv video parsing method. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2011.5995681"},{"key":"987_CR51","unstructured":"Lin, C.-Y. (2004). Rouge: A package for automatic evaluation of summaries. In Text summarization branches out: Proceedings of the ACL-04 workshop, pp. 74\u201381."},{"key":"987_CR52","doi-asserted-by":"publisher","unstructured":"Lin, T.-Y., Maire, M., Belongie, S., Hays, J., Perona, P., Ramanan, D. et al. (2014). Microsoft coco: Common objects in context. In European Conference on Computer Vision (ECCV).","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"987_CR53","unstructured":"Mao, J., Xu, W., Yang, Y., Wang, J., Huang, Z., & Yuille, A. (2015). Deep captioning with multimodal recurrent neural networks (m-rnn). In International conference on learning representations (ICLR)."},{"key":"987_CR54","doi-asserted-by":"publisher","unstructured":"Marszalek, M., Laptev, I., & Schmid, C. (2009). Actions in context. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2009.5206557"},{"key":"987_CR55","unstructured":"Mitchell, M., Dodge, J., Goyal, A., Yamaguchi, K., Stratos, K., Han, X. et al. (2012). Midge: Generating image descriptions from computer vision detections. In Proceedings of the conference of the European chapter of the association for computational linguistics (EACL)."},{"key":"987_CR56","unstructured":"Ordonez, V., Kulkarni, G., & Berg, T.\u00a0L. (2011). Im2text: Describing images using 1 million captioned photographs. In Conference on neural information processing systems (NIPS)."},{"key":"987_CR57","unstructured":"Over, P., Awad, G., Michel, M., Fiscus, J., Sanders, G., Shaw, B., Smeaton, A.\u00a0F., & Qu\u00e9enot, G. (2012). Trecvid 2012\u2014An overview of the goals, tasks, data, evaluation mechanisms and metrics. In Proceedings of TRECVID 2012. NIST, USA."},{"key":"987_CR58","doi-asserted-by":"publisher","unstructured":"Pan, P., Xu, Z., Yang, Y., Wu, F., & Zhuang, Y. (2016a). Hierarchical recurrent neural encoder for video representation with application to captioning. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.117"},{"key":"987_CR59","doi-asserted-by":"publisher","unstructured":"Pan, Y., Mei, T., Yao, T., Li, H., & Rui, Y. (2016b). Jointly modeling embedding and translation to bridge video and language. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.497"},{"key":"987_CR60","unstructured":"Papineni, K., Roukos, S., Ward, T., & Zhu, W. J. (2002). BLEU: A method for automatic evaluation of machine translation. In Proceedings of the annual meeting of the association for computational linguistics (ACL)."},{"key":"987_CR61","doi-asserted-by":"publisher","unstructured":"Ramanathan, V., Joulin, A., Liang, P., & Fei-Fei, L. (2014). Linking people in videos with \u201ctheir\u201d names using coreference resolution. In European conference on computer vision (ECCV).","DOI":"10.1007\/978-3-319-10590-1_7"},{"key":"987_CR62","doi-asserted-by":"crossref","first-page":"25","DOI":"10.1162\/tacl_a_00207","volume":"1","author":"M Regneri","year":"2013","unstructured":"Regneri, M., Rohrbach, M., Wetzel, D., Thater, S., Schiele, B., & Pinkal, M. (2013). Grounding action descriptions in videos. Transactions of the Association for Computational Linguistics (TACL), 1, 25\u201336.","journal-title":"Transactions of the Association for Computational Linguistics (TACL)"},{"key":"987_CR63","doi-asserted-by":"publisher","unstructured":"Rohrbach, A., Rohrbach, M., Qiu, W., Friedrich, A., Pinkal, M., & Schiele, B. (2014). Coherent multi-sentence video description with variable level of detail. In Proceedings of the German conference on pattern recognition (GCPR).","DOI":"10.1007\/978-3-319-11752-2_15"},{"key":"987_CR64","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., & Schiele, B. (2015a). The long-short story of movie description. arXiv:1506.01698 .","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"987_CR65","doi-asserted-by":"publisher","unstructured":"Rohrbach, A., Rohrbach, M., & Schiele, B. (2015b). The long-short story of movie description. In Proceedings of the German Conference on Pattern Recognition (GCPR).","DOI":"10.1007\/978-3-319-24947-6_17"},{"key":"987_CR66","doi-asserted-by":"publisher","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., & Schiele, B. (2015c). A dataset for movie description. In Conference on Computer Vision and Pattern Recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"987_CR67","doi-asserted-by":"publisher","unstructured":"Rohrbach, M., Qiu, W., Titov, I., Thater, S., Pinkal, M., Schiele, B. (2013). Translating video content to natural language descriptions. In International conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2013.61"},{"issue":"3","key":"987_CR68","doi-asserted-by":"publisher","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., et al. (2015). ImageNet large scale visual recognition challenge. International Journal of Computer Vision, 115(3), 211\u2013252.","journal-title":"International Journal of Computer Vision"},{"key":"987_CR69","unstructured":"Salway, A. (2007). A corpus-based analysis of audio description. In Media for all: Subtitling for the deaf, audio description and sign language (pp. 151\u2013174)."},{"key":"987_CR70","doi-asserted-by":"publisher","unstructured":"Salway, A., Lehane, B., & O\u2019Connor, N.\u00a0E. (2007). Associating characters with events in films. In Proceedings of the ACM international conference on image and video retrieval (CIVR).","DOI":"10.1145\/1282280.1282354"},{"key":"987_CR71","doi-asserted-by":"publisher","unstructured":"Schuler, K.\u00a0K., Korhonen, A., & Brown, S.\u00a0W. (2009). Verbnet overview, extensions, mappings and applications. In Proceedings of the conference of the North American chapter of the association for computational linguistics (NAACL).","DOI":"10.3115\/1620950.1620957"},{"key":"987_CR72","unstructured":"Shetty, R., & Laaksonen, J. (2015). Video captioning with recurrent networks based on frame-and video-level features and visual content classification. arXiv:1512.02949 ."},{"key":"987_CR73","doi-asserted-by":"publisher","unstructured":"Shetty, R., & Laaksonen, J. (2016). Frame-and segment-level features and candidate pool evaluation for video caption generation. In Proceedings of the ACM on multimedia conference (MM), pp. 1073\u20131076.","DOI":"10.1145\/2964284.2984062"},{"key":"987_CR74","unstructured":"Simonyan, K., & Zisserman, A. (2015). Very deep convolutional networks for large-scale image recognition. In International conference on learning representations (ICLR)."},{"key":"987_CR75","doi-asserted-by":"crossref","unstructured":"Sivic, J., Everingham, M., & Zisserman, A. (2009). \u201cwho are you?\u201d-learning person specific classifiers from video. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2009.5206513"},{"key":"987_CR76","doi-asserted-by":"crossref","first-page":"207","DOI":"10.1162\/tacl_a_00177","volume":"2","author":"R Socher","year":"2014","unstructured":"Socher, R., Karpathy, A., Le, Q. V., Manning, C. D., & Ng, A. Y. (2014). Grounded compositional semantics for finding and describing images with sentences. Transactions of the Association for Computational Linguistics (TACL), 2, 207\u2013218.","journal-title":"Transactions of the Association for Computational Linguistics (TACL)"},{"key":"987_CR77","doi-asserted-by":"publisher","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., & Rabinovich, A. (2015). Going deeper with convolutions. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"987_CR78","doi-asserted-by":"publisher","unstructured":"Tandon, N., de Melo, G., De, A., & Weikum, G. (2015). Knowlywood: Mining activity knowledge from hollywood narratives. In Proceedings on CIKM.","DOI":"10.1145\/2806416.2806583"},{"key":"987_CR79","unstructured":"Tapaswi, M., Baeuml, M., & Stiefelhagen, R. (2012). \u201dknock! knock! who is it?\u201d probabilistic person identification in tv-series. In Conference on computer vision and pattern recognition (CVPR)."},{"key":"987_CR80","doi-asserted-by":"publisher","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., & Fidler, S. (2016). Movieqa: Understanding stories in movies through question-answering. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.501"},{"key":"987_CR81","unstructured":"Thomason, J., Venugopalan, S., Guadarrama, S., Saenko, K., & Mooney, R.\u00a0J. (2014). Integrating language and vision to generate natural language descriptions of videos in the wild. In Proceedings of the international conference on computational linguistics (COLING)."},{"key":"987_CR82","unstructured":"Torabi, A., Pal, C., Larochelle, H., & Courville, A. (2015). Using descriptive video services to create a large data source for video annotation research. arXiv:1503.01070v1 ."},{"key":"987_CR83","unstructured":"Torabi, A., Tandon, N., & Sigal, L. (2016). Learning language-visual embedding for movie understanding with natural-language. arXiv:1609.08124 ."},{"key":"987_CR84","doi-asserted-by":"publisher","unstructured":"Toutanova, K., Klein, D., Manning, C.\u00a0D., & Singer, Y. (2003). Feature-rich part-of-speech tagging with a cyclic dependency network. In NAACL \u201903: Proceedings of the 2003 conference of the North American chapter of the association for computational linguistics on human language technology. Association for Computational Linguistics.","DOI":"10.3115\/1073445.1073478"},{"key":"987_CR85","doi-asserted-by":"publisher","unstructured":"Vedantam, R., Zitnick, C.\u00a0L., & Parikh, D. (2015). Cider: Consensus-based image description evaluation. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"987_CR86","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., & Saenko, K. (2015a) Sequence to sequence\u2014video to text. arXiv:1505.00487v2 .","DOI":"10.1109\/ICCV.2015.515"},{"key":"987_CR87","doi-asserted-by":"publisher","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., & Saenko, K. (2015b) Sequence to sequence\u2014video to text. In International Conference on Computer Vision (ICCV).","DOI":"10.1109\/ICCV.2015.515"},{"key":"987_CR88","doi-asserted-by":"publisher","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., & Saenko, K. (2015c). Translating videos to natural language using deep recurrent neural networks. In Proceedings of the conference of the North American chapter of the association for computational linguistics (NAACL).","DOI":"10.3115\/v1\/N15-1173"},{"key":"987_CR89","unstructured":"Venugopalan, S., Hendricks, L.\u00a0A., Mooney, R., & Saenko, K. (2016). Improving LSTM-based video description with linguistic knowledge mined from text. arXiv:1604.01729 ."},{"key":"987_CR90","doi-asserted-by":"publisher","unstructured":"Vinyals, O., Toshev, A., Bengio, S., & Erhan, D. (2015). Show and tell: A neural image caption generator. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"987_CR91","doi-asserted-by":"publisher","unstructured":"Wang, H. & Schmid, C. (2013). Action recognition with improved trajectories. In International conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2013.441"},{"issue":"1","key":"987_CR92","doi-asserted-by":"publisher","first-page":"60","DOI":"10.1007\/s11263-012-0594-8","volume":"103","author":"H Wang","year":"2013","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., & Liu, C. L. (2013). Dense trajectories and motion boundary descriptors for action recognition. International Journal of Computer Vision (IJCV), 103(1), 60\u201379.","journal-title":"International Journal of Computer Vision (IJCV)"},{"key":"987_CR93","doi-asserted-by":"publisher","unstructured":"Xiao, J., Hays, J., Ehinger, K.\u00a0A., Oliva, A., & Torralba, A. (2010). Sun database: Large-scale scene recognition from abbey to zoo. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2010.5539970"},{"key":"987_CR94","doi-asserted-by":"publisher","unstructured":"Xu, J., Mei, T., Yao, T., & Rui, Y. (2016). Msr-vtt: A large video description dataset for bridging video and language. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.571"},{"key":"987_CR95","unstructured":"Xu, K., Ba, J., Kiros, R., Courville, A., Salakhutdinov, R., Zemel, R., & Bengio, Y. (2015a). Show, attend and tell: Neural image caption generation with visual attention. In International conference on machine learning (ICML)."},{"key":"987_CR96","doi-asserted-by":"crossref","unstructured":"Xu, R., Xiong, C., Chen, W., & Corso, J.\u00a0J. (2015b). Jointly modeling deep video and compositional text to bridge vision and language in a unified framework. In Conference on artificial intelligence (AAAI).","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"987_CR97","doi-asserted-by":"publisher","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., & Courville, A. (2015). Describing videos by exploiting temporal structure. In International conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2015.512"},{"key":"987_CR98","unstructured":"Yao, L., Ballas, N., Cho, K., Smith, J. R., & Bengio, Y. (2016). Empirical performance upper bounds for image and video captioning. In International conference on learning representations (ICLR)."},{"key":"987_CR99","doi-asserted-by":"crossref","first-page":"67","DOI":"10.1162\/tacl_a_00166","volume":"2","author":"P Young","year":"2014","unstructured":"Young, P., Lai, A., Hodosh, M., & Hockenmaier, J. (2014). From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics (TACL), 2, 67\u201378.","journal-title":"Transactions of the Association for Computational Linguistics (TACL)"},{"key":"987_CR100","doi-asserted-by":"publisher","unstructured":"Yu, H., Wang, J., Huang, Z., Yang, Y., & Xu, W. (2016a). Video paragraph captioning using hierarchical recurrent neural networks. In Conference on computer vision and pattern recognition (CVPR).","DOI":"10.1109\/CVPR.2016.496"},{"key":"987_CR101","unstructured":"Yu, Y., Ko, H., Choi, J., & Kim, G. (2016b). Video captioning and retrieval models with semantic attention. arXiv preprint arXiv:1610.02947 ."},{"key":"987_CR102","doi-asserted-by":"publisher","unstructured":"Zeng, K.-H., Chen, T.-H., Niebles, J.\u00a0C., & Sun, M. (2016). Title generation for user generated videos. In European conference on computer vision.","DOI":"10.1007\/978-3-319-46475-6_38"},{"key":"987_CR103","unstructured":"Zhong, Z., & Ng, H.\u00a0T. (2010). It makes sense: A wide-coverage word sense disambiguation system for free text. In Proceedings of the ACL 2010 system demonstrations."},{"key":"987_CR104","unstructured":"Zhou, B., Lapedriza, A., Xiao, J., Torralba, A., & Oliva, A. (2014). Learning deep features for scene recognition using places database. In Conference on neural information processing systems (NIPS)."},{"key":"987_CR105","unstructured":"Zhu, L., Xu, Z., Yang, Y., & Hauptmann, A.\u00a0G. (2015a). Uncovering temporal context for video question and answering. arXiv:1511.04670 ."},{"key":"987_CR106","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Kiros, R., Zemel, R., Salakhutdinov, R., Urtasun, R., Torralba, A., & Fidler, S. (2015b). Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. In International conference on computer vision (ICCV).","DOI":"10.1109\/ICCV.2015.11"}],"container-title":["International Journal of Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-016-0987-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/article\/10.1007\/s11263-016-0987-1\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/s11263-016-0987-1.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,14]],"date-time":"2025-06-14T19:40:34Z","timestamp":1749930034000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/s11263-016-0987-1"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017,1,25]]},"references-count":106,"journal-issue":{"issue":"1","published-print":{"date-parts":[[2017,5]]}},"alternative-id":["987"],"URL":"https:\/\/doi.org\/10.1007\/s11263-016-0987-1","relation":{},"ISSN":["0920-5691","1573-1405"],"issn-type":[{"value":"0920-5691","type":"print"},{"value":"1573-1405","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017,1,25]]}}}