{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,4,30]],"date-time":"2026-04-30T16:44:05Z","timestamp":1777567445183,"version":"3.51.4"},"publisher-location":"Cham","reference-count":40,"publisher":"Springer International Publishing","isbn-type":[{"value":"9783319541891","type":"print"},{"value":"9783319541907","type":"electronic"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-3-319-54190-7_7","type":"book-chapter","created":{"date-parts":[[2017,3,11]],"date-time":"2017-03-11T05:44:09Z","timestamp":1489211049000},"page":"104-119","source":"Crossref","is-referenced-by-count":13,"title":["Spatio-Temporal Attention Models for Grounded Video Captioning"],"prefix":"10.1007","author":[{"given":"Mihai","family":"Zanfir","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Elisabeta","family":"Marinoiu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Cristian","family":"Sminchisescu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,3,12]]},"reference":[{"key":"7_CR1","unstructured":"Chen, X., Fang, H., Lin, T., Vedantam, R., Gupta, S., Doll\u00e1r, P., Zitnick, C.L.: Microsoft COCO captions: data collection and evaluation server. arXiv preprint arXiv:1504.00325 (2015)"},{"key":"7_CR2","doi-asserted-by":"crossref","unstructured":"Krishna, R., Zhu, Y., Groth, O., Johnson, J., Hata, K., Kravitz, J., Chen, S., Kalantidis, Y., Li, L., Shamma, D.A., Bernstein, M.S., Li, F.: Visual genome: connecting language and vision using crowdsourced dense image annotations. arXiv preprint arXiv:1602.07332 (2016)","DOI":"10.1007\/s11263-016-0981-7"},{"key":"7_CR3","doi-asserted-by":"crossref","unstructured":"Yu, H., Wang, J., Huang, Z., Yang, Y., Xu, W.: Video paragraph captioning using hierarchical recurrent neural networks. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.496"},{"key":"7_CR4","doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.512"},{"key":"7_CR5","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"725","DOI":"10.1007\/978-3-319-10590-1_47","volume-title":"Computer Vision \u2013 ECCV 2014","author":"EH Taralova","year":"2014","unstructured":"Taralova, E.H., Torre, F., Hebert, M.: Motion words for videos. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8689, pp. 725\u2013740. Springer, Cham (2014). doi: 10.1007\/978-3-319-10590-1_47"},{"key":"7_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"737","DOI":"10.1007\/978-3-319-10578-9_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"D Oneata","year":"2014","unstructured":"Oneata, D., Revaud, J., Verbeek, J., Schmid, C.: Spatio-temporal object detection proposals. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8691, pp. 737\u2013752. Springer, Cham (2014). doi: 10.1007\/978-3-319-10578-9_48"},{"key":"7_CR7","doi-asserted-by":"crossref","unstructured":"Fragkiadaki, K., Arbelaez, P., Felsen, P., Malik, J.: Learning to segment moving objects in videos. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7299035"},{"key":"7_CR8","unstructured":"Chen, D.L., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: ACL (2011)"},{"key":"7_CR9","doi-asserted-by":"crossref","unstructured":"Guadarrama, S., Krishnamoorthy, N., Malkarnenkar, G., Venugopalan, S., Mooney, R., Darrell, T., Saenko, K.: Youtube2text: recognizing and describing arbitrary activities using semantic hierarchies and zero-shot recognition. In: ICCV (2013)","DOI":"10.1109\/ICCV.2013.337"},{"key":"7_CR10","unstructured":"Thomason, J., Venugopalan, S., Guadarrama, S., Saenko, K., Mooney, R.: Integrating language and vision to generate natural language descriptions of videos in the wild. In: COLING (2014)"},{"key":"7_CR11","doi-asserted-by":"crossref","unstructured":"Xu, R., Xiong, C., Chen, W., Corso, J.J.: Jointly modeling deep video and compositional text to bridge vision and language in a unified framework. In: AAAI Conference on Artificial Intelligence (2015)","DOI":"10.1609\/aaai.v29i1.9512"},{"key":"7_CR12","unstructured":"Sutskever, I., Vinyals, O., Le, Q.V.: Sequence to sequence learning with neural networks. In: NIPS (2014)"},{"key":"7_CR13","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: ICLR (2015)"},{"key":"7_CR14","doi-asserted-by":"crossref","unstructured":"Vinyals, O., Toshev, A., Bengio, S., Erhan, D.: Show and tell: a neural image caption generator. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298935"},{"key":"7_CR15","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A.C., Salakhutdinov, R., Zemel, R.S., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: ICML (2015)"},{"key":"7_CR16","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"7_CR17","doi-asserted-by":"crossref","unstructured":"Johnson, J., Karpathy, A., Fei-Fei, L.: Densecap: Fully convolutional localization networks for dense captioning. arXiv preprint arXiv:1511.07571 (2015)","DOI":"10.1109\/CVPR.2016.494"},{"key":"7_CR18","doi-asserted-by":"crossref","unstructured":"Donahue, J., Hendricks, L.A., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"7_CR19","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. In: NAACL HLT (2015)","DOI":"10.3115\/v1\/N15-1173"},{"key":"7_CR20","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"209","DOI":"10.1007\/978-3-319-24947-6_17","volume-title":"Pattern Recognition","author":"A Rohrbach","year":"2015","unstructured":"Rohrbach, A., Rohrbach, M., Schiele, B.: The long-short story of movie description. In: Gall, J., Gehler, P., Leibe, B. (eds.) GCPR 2015. LNCS, vol. 9358, pp. 209\u2013221. Springer, Cham (2015). doi: 10.1007\/978-3-319-24947-6_17"},{"key":"7_CR21","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2014)"},{"key":"7_CR22","doi-asserted-by":"crossref","unstructured":"Tran, D., Bourdev, L.D., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3D convolutional networks. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"7_CR23","unstructured":"Hochreiter, S., Bengio, Y., Frasconi, P., Schmidhuber, J.: Gradient flow in recurrent nets: the difficulty of learning long-term dependencies (2001)"},{"key":"7_CR24","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9, 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"7_CR25","doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Rohrbach, M., Donahue, J., Mooney, R., Darrell, T., Saenko, K.: Sequence to sequence - video to text. In: ICCV (2015)","DOI":"10.1109\/ICCV.2015.515"},{"key":"7_CR26","unstructured":"Xu, H., Venugopalan, S., Ramanishka, V., Rohrbach, M., Saenko, K.: A multi-scale multiple instance video description network. In: arXiv preprint arXiv:1505.05914 (2015)"},{"key":"7_CR27","unstructured":"Zaremba, W., Sutskever, I.: Learning to execute. arXiv preprint arXiv:1410.4615 (2014)"},{"key":"7_CR28","doi-asserted-by":"crossref","unstructured":"Manning, C.D., Surdeanu, M., Bauer, J., Finkel, J., Bethard, S.J., McClosky, D.: The Stanford CoreNLP natural language processing toolkit. In: ACL (2014)","DOI":"10.3115\/v1\/P14-5010"},{"key":"7_CR29","doi-asserted-by":"crossref","unstructured":"Cawley, G.C.: Leave-one-out cross-validation based model selection criteria for weighted ls-svms. In: IJCNN (2006)","DOI":"10.1109\/IJCNN.2006.246634"},{"key":"7_CR30","doi-asserted-by":"crossref","unstructured":"Papineni, K., Roukos, S., Ward, T., Zhu, W.J.: Bleu: a method for automatic evaluation of machine translation. In: ACL (2002)","DOI":"10.3115\/1073083.1073135"},{"key":"7_CR31","unstructured":"Lavie, A., Agarwal, A.: Meteor: an automatic metric for MT evaluation with improved correlation with human judgments, pp. 65\u201372 (2005)"},{"key":"7_CR32","doi-asserted-by":"crossref","unstructured":"Lienhart, R.W.: Comparison of automatic shot boundary detection algorithms. In: International Society for Optics and Photonics on Electronic Imaging 1999, pp. 290\u2013301 (1998)","DOI":"10.1117\/12.333848"},{"key":"7_CR33","doi-asserted-by":"crossref","first-page":"211","DOI":"10.1007\/s11263-015-0816-y","volume":"115","author":"O Russakovsky","year":"2015","unstructured":"Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, L.: ImageNet large scale visual recognition challenge. IJCV 115, 211\u2013252 (2015)","journal-title":"IJCV"},{"key":"7_CR34","unstructured":"Ren, S., He, K., Girshick, R., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: NIPS (2015)"},{"key":"7_CR35","doi-asserted-by":"crossref","unstructured":"Jia, Y., Shelhamer, E., Donahue, J., Karayev, S., Long, J., Girshick, R., Guadarrama, S., Darrell, T.: Caffe: convolutional architecture for fast feature embedding. In: ACMMM (2014)","DOI":"10.1145\/2647868.2654889"},{"key":"7_CR36","doi-asserted-by":"crossref","unstructured":"Wang, H., Kl\u00e4ser, A., Schmid, C., Liu, C.L.: Action recognition by dense trajectories. In: CVPR. IEEE (2011)","DOI":"10.1109\/CVPR.2011.5995407"},{"key":"7_CR37","doi-asserted-by":"crossref","unstructured":"Gkioxari, G., Malik, J.: Finding action tubes. In: CVPR (2015)","DOI":"10.1109\/CVPR.2015.7298676"},{"key":"7_CR38","doi-asserted-by":"crossref","unstructured":"Pan, Y., T.M., Yao, T., Li, H., Rui, Y.: Jointly modeling embedding and translation to bridge video and language. In: arXiv preprint arXiv:1505.01861 . (2015)","DOI":"10.1109\/CVPR.2016.497"},{"key":"7_CR39","doi-asserted-by":"crossref","unstructured":"Pan, P., Xu, Z., Yang, Y., Wu, F., Zhuang, Y.: Hierarchical recurrent neural encoder for video representation with application to captioning. arXiv preprint arXiv:1511.03476 (2015)","DOI":"10.1109\/CVPR.2016.117"},{"key":"7_CR40","unstructured":"Ballas, N., Yao, L., Pal, C., Courville, A.C.: Delving deeper into convolutional networks for learning video representations. arXiv preprint arXiv:1511.06432 (2015)"}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2016"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-319-54190-7_7","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,7,25]],"date-time":"2022-07-25T23:31:25Z","timestamp":1658791885000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-319-54190-7_7"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9783319541891","9783319541907"],"references-count":40,"URL":"https:\/\/doi.org\/10.1007\/978-3-319-54190-7_7","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"value":"0302-9743","type":"print"},{"value":"1611-3349","type":"electronic"}],"subject":[],"published":{"date-parts":[[2017]]}}}