{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,8,23]],"date-time":"2025-08-23T05:15:36Z","timestamp":1755926136555,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Singapore","isbn-type":[{"type":"print","value":"9789811072987"},{"type":"electronic","value":"9789811072994"}],"license":[{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2017,1,1]],"date-time":"2017-01-01T00:00:00Z","timestamp":1483228800000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2017]]},"DOI":"10.1007\/978-981-10-7299-4_33","type":"book-chapter","created":{"date-parts":[[2017,11,29]],"date-time":"2017-11-29T16:27:13Z","timestamp":1511972833000},"page":"404-415","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["Video Question Answering Using a Forget Memory Network"],"prefix":"10.1007","author":[{"given":"Yuanyuan","family":"Ge","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Youjiang","family":"Xu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yahong","family":"Han","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2017,11,30]]},"reference":[{"key":"33_CR1","unstructured":"Bahdanau, D., Cho, K., Bengio, Y.: Neural machine translation by jointly learning to align and translate. In: International Conference on Learning Representations (ICLR) (2015)"},{"key":"33_CR2","doi-asserted-by":"crossref","unstructured":"Cho, K., Van Merri\u00ebnboer, B., Gulcehre, C., Bahdanau, D., Bougares, F., Schwenk, H., Bengio, Y.: Learning phrase representations using RNN encoder-decoder for statistical machine translation. arXiv preprint arXiv:1406.1078 (2014)","DOI":"10.3115\/v1\/D14-1179"},{"key":"33_CR3","doi-asserted-by":"crossref","unstructured":"Das, R., Zaheer, M., Reddy, S., McCallum, A.: Question answering on knowledge bases and text using universal schema and memory networks. arXiv preprint arXiv:1704.08384 (2017)","DOI":"10.18653\/v1\/P17-2057"},{"key":"33_CR4","doi-asserted-by":"crossref","unstructured":"Donahue, J., Anne Hendricks, L., Guadarrama, S., Rohrbach, M., Venugopalan, S., Saenko, K., Darrell, T.: Long-term recurrent convolutional networks for visual recognition and description. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 2625\u20132634 (2015)","DOI":"10.1109\/CVPR.2015.7298878"},{"key":"33_CR5","doi-asserted-by":"crossref","unstructured":"He, K., Ren, X.Z.S., Sun, J.: Deep residual learning for image recognition. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 770\u2013778 (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"33_CR6","unstructured":"Heilman, M., Smith, N.A.: Good question! statistical ranking for question generation. In: Human Language Technologies: The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics, pp. 609\u2013617. Association for Computational Linguistics (2010)"},{"key":"33_CR7","unstructured":"Jiang, A., Wang, F., Porikli, F., Li, Y.: Compositional memory for visual question answering. arXiv preprint arXiv:1511.05676 (2015)"},{"key":"33_CR8","doi-asserted-by":"crossref","unstructured":"Karpathy, A., Fei-Fei, L.: Deep visual-semantic alignments for generating image descriptions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 3128\u20133137 (2015)","DOI":"10.1109\/CVPR.2015.7298932"},{"key":"33_CR9","unstructured":"Kingma, D., Ba, J.: Adam: a method for stochastic optimization. arXiv preprint arXiv:1412.6980 (2014)"},{"key":"33_CR10","unstructured":"Krizhevsky, A., Sutskever, I., Hinton, G.E.: Imagenet classification with deep convolutional neural networks. In: Advances in Neural Information Processing Systems, pp. 1097\u20131105 (2012)"},{"key":"33_CR11","unstructured":"Kumar, A., Irsoy, O., Ondruska, P., Iyyer, M., Bradbury, J., Gulrajani, I., Zhong, V., Paulus, R., Socher, R.: Ask me anything: dynamic memory networks for natural language processing. In: International Conference on Machine Learning, pp. 1378\u20131387 (2016)"},{"key":"33_CR12","doi-asserted-by":"crossref","unstructured":"Li, G., Ma, S., Han, Y.: Summarization-based video caption via deep neural networks. In: Proceedings of the 23rd ACM International Conference on Multimedia, pp. 1191\u20131194. ACM (2015)","DOI":"10.1145\/2733373.2806314"},{"key":"33_CR13","unstructured":"Mikolov, T., Sutskever, I., Chen, K., Corrado, G.S., Dean, J.: Distributed representations of words and phrases and their compositionality. In: Advances in Neural Information Processing Systems, pp. 3111\u20133119 (2013)"},{"key":"33_CR14","doi-asserted-by":"crossref","unstructured":"Noh, H., Hongsuck Seo, P., Han, B.: Image question answering using convolutional neural network with dynamic parameter prediction. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 30\u201338 (2016)","DOI":"10.1109\/CVPR.2016.11"},{"key":"33_CR15","doi-asserted-by":"crossref","unstructured":"Redmon, J., Farhadi, A.: YOLO9000: better, faster, stronger. arXiv preprint arXiv:1612.08242 (2016)","DOI":"10.1109\/CVPR.2017.690"},{"key":"33_CR16","unstructured":"Ren, S., He, K., Ross, G., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. In: Advances in Neural Information Processing Systems, pp. 91\u201399 (2015)"},{"key":"33_CR17","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., Rohrbach, M., Tandon, N., Schiele, B.: A dataset for movie description. In: Proceedings of the IEEE Conference on Computer Vision And Pattern Recognition, pp. 3202\u20133212 (2015)","DOI":"10.1109\/CVPR.2015.7298940"},{"key":"33_CR18","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. arXiv preprint arXiv:1409.1556 (2014)"},{"key":"33_CR19","unstructured":"Sukhbaatar, S., Arthur, S., Weston, J., Fergus, R.: End-to-end memory networks. In: Advances in Neural Information Processing Systems, pp. 2440\u20132448 (2015)"},{"key":"33_CR20","doi-asserted-by":"crossref","unstructured":"Szegedy, C., Liu, W., Jia, Y., Sermanet, P., Reed, S., Anguelov, D., Erhan, D., Vanhoucke, V., Rabinovich, A.: Going deeper with convolutions. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 1\u20139 (2015)","DOI":"10.1109\/CVPR.2015.7298594"},{"key":"33_CR21","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., Fidler, S.: MovieQA: understanding stories in movies through question-answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4631\u20134640 (2016)","DOI":"10.1109\/CVPR.2016.501"},{"key":"33_CR22","unstructured":"Weston, J., Chopra, S., Bordes, A.: Memory networks. arXiv preprint arXiv:1410.3916 (2014)"},{"key":"33_CR23","unstructured":"Xiong, C., Merity, S., Socher, R.: Dynamic memory networks for visual and textual question answering. In: International Conference on Machine Learning, pp. 2397\u20132406 (2016)"},{"key":"33_CR24","unstructured":"Xu, K., Ba, J., Kiros, R., Cho, K., Courville, A., Salakhudinov, R., Zemel, R., Bengio, Y.: Show, attend and tell: neural image caption generation with visual attention. In: International Conference on Machine Learning, pp. 2048\u20132057 (2015)"},{"key":"33_CR25","doi-asserted-by":"crossref","unstructured":"Yang, Z., He, X., Gao, J., Li, D., Smola, A.: Stacked attention networks for image question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 21\u201329 (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"33_CR26","doi-asserted-by":"crossref","unstructured":"Yang, Z., Han, Y., Wang, Z.: Catching the temporal regions-of-interest for video captioning. In: Proceedings of the ACM International Conference on Multimedia (ACM MM) (2017)","DOI":"10.1145\/3123266.3123327"},{"key":"33_CR27","doi-asserted-by":"crossref","unstructured":"Zeng, K.H., Chen, T.H., Chuang, C.Y., Liao, Y.H., Niebles, J.C., Sun, M.: Leveraging video descriptions to learn video question answering. In: AAAI, pp. 4334\u20134340 (2017)","DOI":"10.1609\/aaai.v31i1.11238"},{"key":"33_CR28","unstructured":"Zhu, L., Xu, Z., Yang, Y., Hauptmann, A.G.: Uncovering temporal context for video question and answering. arXiv preprint arXiv:1511.04670 (2015)"},{"key":"33_CR29","doi-asserted-by":"crossref","unstructured":"Zhu, Y., Groth, O., Bernstein, M., Li, F.F.: Visual7W: grounded question answering in images. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp. 4995\u20135004 (2016)","DOI":"10.1109\/CVPR.2016.540"}],"container-title":["Communications in Computer and Information Science","Computer Vision"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-10-7299-4_33","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,7]],"date-time":"2024-03-07T14:15:16Z","timestamp":1709820916000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-10-7299-4_33"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2017]]},"ISBN":["9789811072987","9789811072994"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-10-7299-4_33","relation":{},"ISSN":["1865-0929","1865-0937"],"issn-type":[{"type":"print","value":"1865-0929"},{"type":"electronic","value":"1865-0937"}],"subject":[],"published":{"date-parts":[[2017]]},"assertion":[{"value":"30 November 2017","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"CCCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF Chinese Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Tianjin","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2017","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"11 October 2017","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14 October 2017","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"cccv2017","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/ccf-cccv.org\/2017\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"This content has been made available to all.","name":"free","label":"Free to read"}]}}