{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2024,9,10]],"date-time":"2024-09-10T08:51:51Z","timestamp":1725958311690},"publisher-location":"Singapore","reference-count":22,"publisher":"Springer Singapore","isbn-type":[{"type":"print","value":"9789811085291"},{"type":"electronic","value":"9789811085307"}],"license":[{"start":{"date-parts":[[2018,1,1]],"date-time":"2018-01-01T00:00:00Z","timestamp":1514764800000},"content-version":"unspecified","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2018]]},"DOI":"10.1007\/978-981-10-8530-7_34","type":"book-chapter","created":{"date-parts":[[2018,2,28]],"date-time":"2018-02-28T15:04:19Z","timestamp":1519830259000},"page":"349-359","update-policy":"http:\/\/dx.doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":1,"title":["Initialized Frame Attention Networks for Video Question Answering"],"prefix":"10.1007","author":[{"given":"Kun","family":"Gao","sequence":"first","affiliation":[]},{"given":"Xianglei","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Yahong","family":"Han","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2018,3,1]]},"reference":[{"issue":"8","key":"34_CR1","doi-asserted-by":"crossref","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M., Fei-Fei, L.: Visual7W: grounded question answering in images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4995\u20135004 (2016)","key":"34_CR2","DOI":"10.1109\/CVPR.2016.540"},{"doi-asserted-by":"crossref","unstructured":"Venugopalan, S., Xu, H., Donahue, J., Rohrbach, M., Mooney, R., Saenko, K.: Translating videos to natural language using deep recurrent neural networks. arXiv preprint arXiv:1412.4729 (2014)","key":"34_CR3","DOI":"10.3115\/v1\/N15-1173"},{"doi-asserted-by":"crossref","unstructured":"Yao, L., Torabi, A., Cho, K., Ballas, N., Pal, C., Larochelle, H., Courville, A.: Describing videos by exploiting temporal structure. In: Proceedings of the IEEE International Conference on Computer Vision, pp. 4507\u20134515 (2015)","key":"34_CR4","DOI":"10.1109\/ICCV.2015.512"},{"doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Deng, L., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 21\u201329 (2016)","key":"34_CR5","DOI":"10.1109\/CVPR.2016.10"},{"key":"34_CR6","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"451","DOI":"10.1007\/978-3-319-46478-7_28","volume-title":"Computer Vision \u2013 ECCV 2016","author":"H Xu","year":"2016","unstructured":"Xu, H., Saenko, K.: Ask, attend and answer: exploring question-guided spatial attention for visual question answering. In: Leibe, B., Matas, J., Sebe, N., Welling, M. (eds.) ECCV 2016. LNCS, vol. 9911, pp. 451\u2013466. Springer, Cham (2016). https:\/\/doi.org\/10.1007\/978-3-319-46478-7_28"},{"doi-asserted-by":"crossref","unstructured":"Shih, K.J., Singh, S., Hoiem, D.: Where to look: focus regions for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4613\u20134621 (2016)","key":"34_CR7","DOI":"10.1109\/CVPR.2016.499"},{"unstructured":"Chen, K., Wang, J., Chen, L.C., Gao, H., Xu, W., Nevatia, R.: ABC-CNN: an attention based convolutional neural network for visual question answering. arXiv preprint arXiv:1511.05960 (2015)","key":"34_CR8"},{"issue":"2","key":"34_CR9","doi-asserted-by":"crossref","first-page":"42","DOI":"10.1109\/MMUL.2014.29","volume":"21","author":"K Tu","year":"2014","unstructured":"Tu, K., Meng, M., Lee, M.W., Choe, T.E., Zhu, S.C.: Joint video and text parsing for understanding events and answering queries. IEEE MultiMed. 21(2), 42\u201370 (2014)","journal-title":"IEEE MultiMed."},{"unstructured":"Zhu, L., Xu, Z., Yang, Y., Hauptmann, A.G.: Uncovering temporal context for video question and answering. arXiv preprint arXiv:1511.04670 (2015)","key":"34_CR10"},{"unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)","key":"34_CR11"},{"doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128\u20133137 (2015)","key":"34_CR12","DOI":"10.1109\/CVPR.2015.7298932"},{"unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems, pp. 1097\u20131105 (2012)","key":"34_CR13"},{"unstructured":"Simonyan, K., Zisserman, A.: Two-stream convolutional networks for action recognition in videos. In: Advances in Neural Information Processing Systems, pp. 568\u2013576 (2014)","key":"34_CR14"},{"doi-asserted-by":"crossref","unstructured":"Ma, L., Lu, Z., Li, H.: Learning to answer questions from image using convolutional neural network. In: AAAI, p. 16 (2016)","key":"34_CR15","DOI":"10.1609\/aaai.v30i1.10442"},{"doi-asserted-by":"crossref","unstructured":"Zeng, K.H., Chen, T.H., Chuang, C.Y., Liao, Y.H., Niebles, J.C., Sun, M.: Leveraging video descriptions to learn video question answering. In: AAAI, pp. 4334\u20134340 (2017)","key":"34_CR16","DOI":"10.1609\/aaai.v31i1.11238"},{"key":"34_CR17","doi-asserted-by":"crossref","first-page":"721","DOI":"10.1016\/S0167-8655(98)00050-6","volume":"19","author":"DL Vilari\u00f1o","year":"1998","unstructured":"Vilari\u00f1o, D.L., Brea, V.M., Cabello, D., Pardo, J.M.: Discrete-time CNN for image segmentation by active contours. Pattern Recogn. Lett. 19, 721\u2013734 (1998)","journal-title":"Pattern Recogn. Lett."},{"key":"34_CR18","doi-asserted-by":"crossref","first-page":"1555","DOI":"10.1109\/TMM.2016.2567071","volume":"18","author":"R Hong","year":"2016","unstructured":"Hong, R., Zhang, L., Zhang, C., Zimmermann, R.: Flickr circles: aesthetic tendency discovery by multi-view regularized topic modeling. IEEE Trans. Multimed. 18, 1555\u20131567 (2016)","journal-title":"IEEE Trans. Multimed."},{"key":"34_CR19","doi-asserted-by":"crossref","first-page":"5814","DOI":"10.1109\/TIP.2016.2614132","volume":"25","author":"R Hong","year":"2016","unstructured":"Hong, R., Hu, Z., Wang, R., Wang, M., Tao, D.: Multi-view object retrieval via multi-scale topic models. IEEE Trans. Image Process. 25, 5814\u20135827 (2016)","journal-title":"IEEE Trans. Image Process."},{"key":"34_CR20","doi-asserted-by":"crossref","first-page":"152","DOI":"10.1109\/TBDATA.2016.2515640","volume":"1","author":"R Hong","year":"2015","unstructured":"Hong, R., Yang, Y., Wang, M., Hua, X.S.: Learning visual semantic relationships for efficient visual retrieval. IEEE Trans. Big Data 1, 152\u2013161 (2015)","journal-title":"IEEE Trans. Big Data"},{"doi-asserted-by":"crossref","unstructured":"Yang, Z., Han, Y., Wang Z.: Catching the temporal regions-of-interest for video captioning. In: Proceedings of the ACM International Conference on Multimedia, MM 2017. ACM (2017)","key":"34_CR21","DOI":"10.1145\/3123266.3123327"},{"doi-asserted-by":"crossref","unstructured":"Li, G., Ma, S., Han, Y.: Summarization-based video caption via deep neural networks. In: Proceedings of the 23rd ACM International Conference on Multimedia, pp. 1191\u20131194 (2015)","key":"34_CR22","DOI":"10.1145\/2733373.2806314"}],"container-title":["Communications in Computer and Information Science","Internet Multimedia Computing and Service"],"original-title":[],"link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-10-8530-7_34","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,8,15]],"date-time":"2022-08-15T01:06:16Z","timestamp":1660525576000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-981-10-8530-7_34"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2018]]},"ISBN":["9789811085291","9789811085307"],"references-count":22,"URL":"https:\/\/doi.org\/10.1007\/978-981-10-8530-7_34","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2018]]}}}