{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,1]],"date-time":"2026-05-01T17:03:13Z","timestamp":1777654993066,"version":"3.51.4"},"publisher-location":"Cham","reference-count":48,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783030012601","type":"print"},{"value":"9783030012618","type":"electronic"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"},{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-3-030-01261-8_22","type":"book-chapter","created":{"date-parts":[[2018,10,8]],"date-time":"2018-10-08T12:14:51Z","timestamp":1539000891000},"page":"367-384","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":153,"title":["Less Is More: Picking Informative Frames for Video Captioning"],"prefix":"10.1007","author":[{"given":"Yangyu","family":"Chen","sequence":"first","affiliation":[]},{"given":"Shuhui","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Weigang","family":"Zhang","sequence":"additional","affiliation":[]},{"given":"Qingming","family":"Huang","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,10,6]]},"reference":[{"key":"22_CR1","unstructured":"Banerjee, S., Lavie, A.: Meteor: an automatic metric for MT evaluation with improved correlation with human judgments. In: ACL, pp. 
65\u201372 (2005)"},{"key":"22_CR2","doi-asserted-by":"crossref","unstructured":"Baraldi, L., Grana, C., Cucchiara, R.: Hierarchical boundary-aware neural encoder for video captioning. In: CVPR, pp. 3185\u20133194 (2017)","DOI":"10.1109\/CVPR.2017.339"},{"key":"22_CR3","unstructured":"Bengio, S., Vinyals, O., Jaitly, N., Shazeer, N.: Scheduled sampling for sequence prediction with recurrent neural networks. In: NIPS, pp. 1171\u20131179 (2015)"},{"key":"22_CR4","unstructured":"Chen, D.L., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: ACL, pp. 190\u2013200 (2011)"},{"key":"22_CR5","doi-asserted-by":"crossref","unstructured":"Cho, K., et al.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. In: EMNLP, pp. 1724\u20131734 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"issue":"2","key":"22_CR6","doi-asserted-by":"publisher","first-page":"69","DOI":"10.1177\/155005940803900209","volume":"39","author":"HC Cromwell","year":"2008","unstructured":"Cromwell, H.C., Mears, R.P., Wan, L., Boutros, N.N.: Sensory gating: a translational effort from basic to clinical science. Clinical EEG Neurosci. 39(2), 69\u201372 (2008)","journal-title":"Clinical EEG Neurosci."},{"key":"22_CR7","doi-asserted-by":"crossref","unstructured":"Dong, J., Li, X., Lan, W., Huo, Y., Snoek, C.G.M.: Early embedding and late reranking for video captioning. In: ACM Multimedia, pp. 1082\u20131086 (2016)","DOI":"10.1145\/2964284.2984064"},{"key":"22_CR8","doi-asserted-by":"crossref","unstructured":"Fang, H., et al.: From captions to visual concepts and back. In: CVPR, pp. 1473\u20131482 (2015)","DOI":"10.1109\/CVPR.2015.7298754"},{"key":"22_CR9","doi-asserted-by":"crossref","unstructured":"Farhadi, A., Hejrati, M., Sadeghi, M.A., Young, P., Rashtchian, C., Hockenmaier, J., Forsyth, D.: Every picture tells a story: Generating sentences from images. In: ECCV, pp. 
15\u201329 (2010)","DOI":"10.1007\/978-3-642-15561-1_2"},{"key":"22_CR10","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. In: CVPR, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"22_CR11","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.J.J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"22_CR12","doi-asserted-by":"crossref","unstructured":"Hori, C., Hori, T., Lee, T.Y., Sumi, K., Hershey, J.R., Marks, T.K.: Attention-based multimodal fusion for video description. In: ICCV, pp. 4203\u20134212 (2017)","DOI":"10.1109\/ICCV.2017.450"},{"issue":"11","key":"22_CR13","doi-asserted-by":"publisher","first-page":"1254","DOI":"10.1109\/34.730558","volume":"20","author":"L Itti","year":"1998","unstructured":"Itti, L., Koch, C., Niebur, E.: A model of saliency-based visual attention for rapid scene analysis. IEEE Trans. Pattern Anal. Mach. Intell. 20(11), 1254\u20131259 (1998)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"22_CR14","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: Fully convolutional localization networks for dense captioning. In: CVPR, pp. 4565\u20134574 (2016)","DOI":"10.1109\/CVPR.2016.494"},{"key":"22_CR15","unstructured":"Kingma, D.P., Ba, J.L.: Adam: a method for stochastic optimization. In: ICLR (2015)"},{"issue":"2","key":"22_CR16","doi-asserted-by":"publisher","first-page":"171","DOI":"10.1023\/A:1020346032608","volume":"50","author":"A Kojima","year":"2002","unstructured":"Kojima, A., Tamura, T., Fukunaga, K.: Natural language description of human activities from video images based on concept hierarchy of actions. 
IJCV 50(2), 171\u2013184 (2002)","journal-title":"IJCV"},{"key":"22_CR17","doi-asserted-by":"crossref","unstructured":"Krause, J., Johnson, J., Krishna, R., Fei-Fei, L.: A hierarchical approach for generating descriptive image paragraphs. In: CVPR, pp. 3337\u20133345 (2017)","DOI":"10.1109\/CVPR.2017.356"},{"key":"22_CR18","doi-asserted-by":"crossref","unstructured":"Kulkarni, G., et al.: Baby talk: Understanding and generating image descriptions. In: CVPR, pp. 1601\u20131608 (2011)","DOI":"10.1109\/CVPR.2011.5995466"},{"key":"22_CR19","unstructured":"Lin, C.Y.: Rouge: A package for automatic evaluation of summaries. In: ACL (2004)"},{"key":"22_CR20","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical co-attention for visual question answering. In: NIPS, pp. 289\u2013297 (2016)"},{"key":"22_CR21","unstructured":"Mnih, V., Heess, N., Graves, A., Kavukcuoglu, K.: Recurrent models of visual attention. In: NIPS, pp. 2204\u20132212 (2014)"},{"key":"22_CR22","doi-asserted-by":"crossref","unstructured":"Pan, P., Xu, Z., Yang, Y., Wu, F., Zhuang, Y.: Hierarchical recurrent neural encoder for video representation with application to captioning. In: CVPR, pp. 1029\u20131038 (2016)","DOI":"10.1109\/CVPR.2016.117"},{"key":"22_CR23","doi-asserted-by":"crossref","unstructured":"Pan, Y., Mei, T., Yao, T., Li, H., Rui, Y.: Jointly modeling embedding and translation to bridge video and language. In: CVPR, pp. 4594\u20134602 (2016)","DOI":"10.1109\/CVPR.2016.497"},{"key":"22_CR24","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: ACL, pp. 311\u2013318 (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"22_CR25","unstructured":"Ranzato, M., Chopra, S., Auli, M., Zaremba, W.: Sequence level training with recurrent neural networks. 
In: ICLR (2016)"},{"key":"22_CR26","doi-asserted-by":"crossref","unstructured":"Rennie, S.J., Marcheret, E., Mroueh, Y., Ross, J., Goel, V.: Self-critical sequence training for image captioning. In: CVPR, pp. 1179\u20131195 (2017)","DOI":"10.1109\/CVPR.2017.131"},{"key":"22_CR27","doi-asserted-by":"crossref","unstructured":"Shen, Z., et al.: Weakly supervised dense video captioning. In: CVPR, pp. 5159\u20135167 (2017)","DOI":"10.1109\/CVPR.2017.548"},{"key":"22_CR28","doi-asserted-by":"crossref","unstructured":"Shetty, R., Laaksonen, J.: Frame-and segment-level features and candidate pool evaluation for video caption generation. In: ACM Multimedia, pp. 1073\u20131076 (2016)","DOI":"10.1145\/2964284.2984062"},{"key":"22_CR29","unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: NIPS, pp. 568\u2013576 (2014)"},{"key":"22_CR30","unstructured":"Song, J., Guo, Y., Gao, L., Li, X., Hanjalic, A., Shen, H.T.: From deterministic to generative: multi-modal stochastic RNNs for video captioning. arXiv (2017)"},{"key":"22_CR31","doi-asserted-by":"crossref","unstructured":"Song, Y., Redi, M., Vallmitjana, J., Jaimes, A.: To click or not to click: automatic selection of beautiful thumbnails from videos. In: CIKM, pp. 659\u2013668 (2016)","DOI":"10.1145\/2983323.2983349"},{"key":"22_CR32","doi-asserted-by":"crossref","unstructured":"Song, Y., Vallmitjana, J., Stent, A., Jaimes, A.: Tvsum: Summarizing web videos using titles. In: CVPR, pp. 5179\u20135187 (2015)","DOI":"10.1109\/CVPR.2015.7299154"},{"key":"22_CR33","doi-asserted-by":"crossref","unstructured":"Vedantam, R., Zitnick, C.L., Parikh, D.: Cider: consensus-based image description evaluation. In: CVPR, pp. 4566\u20134575 (2015)","DOI":"10.1109\/CVPR.2015.7299087"},{"key":"22_CR34","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Darrell, T., Donahue, J., Saenko, K., Mooney, R.: Sequence to sequence - video to text. In: ICCV, pp. 
4534\u20134542 (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"22_CR35","doi-asserted-by":"crossref","unstructured":"Wang, B., Ma, L., Zhang, W., Liu, W.: Reconstruction network for video captioning. In: CVPR, pp. 7622\u20137631 (2018)","DOI":"10.1109\/CVPR.2018.00795"},{"key":"22_CR36","doi-asserted-by":"crossref","unstructured":"Wang, J., Jiang, W., Ma, L., Liu, W., Xu, Y.: Bidirectional attentive fusion with context gating for dense video captioning. In: CVPR, pp. 7190\u20137198 (2018)","DOI":"10.1109\/CVPR.2018.00751"},{"issue":"3\u20134","key":"22_CR37","doi-asserted-by":"crossref","first-page":"229","DOI":"10.1023\/A:1022672621406","volume":"8","author":"RJ Williams","year":"1992","unstructured":"Williams, R.J.: Simple statistical gradient-following algorithms for connectionist reinforcement learning. Mach. Learn. 8(3\u20134), 229\u2013256 (1992)","journal-title":"Mach. Learn."},{"key":"22_CR38","doi-asserted-by":"crossref","unstructured":"Xu, J., Mei, T., Yao, T., Rui, Y.: Msr-vtt: A large video description dataset for bridging video and language. In: CVPR, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"22_CR39","unstructured":"Xu, K., et al.: Show, attend and tell: neural image caption generation with visual attention. In: ICML, pp. 2048\u20132057 (2015)"},{"key":"22_CR40","unstructured":"Yang, Y., Teo, C.L., Daum\u00e9 III, H., Aloimonos, Y.: Corpus-guided sentence generation of natural images. In: EMNLP, pp. 444\u2013454 (2011)"},{"key":"22_CR41","doi-asserted-by":"crossref","unstructured":"Yao, L., Cho, K., Ballas, N., Pal, C., Courville, A.: Describing videos by exploiting temporal structure. In: ICCV, pp. 4507\u20134515 (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"22_CR42","doi-asserted-by":"crossref","unstructured":"Yeung, S., Russakovsky, O., Mori, G., Fei-Fei, L.: End-to-end learning of action detection from frame glimpses in videos. In: CVPR, pp. 
2678\u20132687 (2016)","DOI":"10.1109\/CVPR.2016.293"},{"key":"22_CR43","doi-asserted-by":"crossref","unstructured":"You, Q., Jin, H., Wang, Z., Fang, C., Luo, J.: Image captioning with semantic attention. In: CVPR, pp. 4651\u20134659 (2016)","DOI":"10.1109\/CVPR.2016.503"},{"key":"22_CR44","doi-asserted-by":"crossref","unstructured":"Yu, H., Wang, J., Huang, Z., Yang, Y., Xu, W.: Video paragraph captioning using hierarchical recurrent neural networks. In: CVPR, pp. 4584\u20134593 (2016)","DOI":"10.1109\/CVPR.2016.496"},{"key":"22_CR45","doi-asserted-by":"crossref","unstructured":"Yu, Y., et al.: Supervising neural attention models for video captioning by human gaze data. In: CVPR, pp. 6119\u20136127 (2017)","DOI":"10.1109\/CVPR.2017.648"},{"key":"22_CR46","doi-asserted-by":"crossref","unstructured":"Zeng, K., Chen, T., Niebles, J.C., Sun, M.: Title generation for user generated videos. In: ECCV, pp. 609\u2013625 (2016)","DOI":"10.1007\/978-3-319-46475-6_38"},{"key":"22_CR47","doi-asserted-by":"crossref","unstructured":"Zhao, B., Xing, E.P.: Quasi real-time summarization for consumer videos. In: CVPR, pp. 2513\u20132520 (2014)","DOI":"10.1109\/CVPR.2014.322"},{"key":"22_CR48","doi-asserted-by":"crossref","unstructured":"Zheng, H., Fu, J., Mei, T.: Look closer to see better: Recurrent attention convolutional neural network for fine-grained image recognition. In: CVPR, pp. 
4476\u20134484 (2017)","DOI":"10.1109\/ICCV.2017.557"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ECCV 2018"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-01261-8_22","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2026,4,3]],"date-time":"2026-04-03T18:58:55Z","timestamp":1775242735000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-030-01261-8_22"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9783030012601","9783030012618"],"references-count":48,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-01261-8_22","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2018]]},"assertion":[{"value":"6 October 2018","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ECCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"European Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Munich","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Germany","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"8 September 2018","order":7,"name":"conference_start_date","label":"Conference Start 
Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 September 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"15","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"eccv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"https:\/\/eccv2018.org\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}