{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,27]],"date-time":"2025-03-27T17:23:21Z","timestamp":1743096201042,"version":"3.40.3"},"publisher-location":"Cham","reference-count":34,"publisher":"Springer International Publishing","isbn-type":[{"type":"print","value":"9783030208752"},{"type":"electronic","value":"9783030208769"}],"license":[{"start":{"date-parts":[[2019,1,1]],"date-time":"2019-01-01T00:00:00Z","timestamp":1546300800000},"content-version":"tdm","delay-in-days":0,"URL":"http:\/\/www.springer.com\/tdm"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2019]]},"DOI":"10.1007\/978-3-030-20876-9_26","type":"book-chapter","created":{"date-parts":[[2019,5,25]],"date-time":"2019-05-25T14:03:53Z","timestamp":1558793033000},"page":"404-419","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":2,"title":["A2A: Attention to Attention Reasoning for Movie Question Answering"],"prefix":"10.1007","author":[{"given":"Chao-Ning","family":"Liu","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ding-Jie","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Hwann-Tzong","family":"Chen","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Tyng-Luh","family":"Liu","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2019,5,26]]},"reference":[{"issue":"1","key":"26_CR1","doi-asserted-by":"publisher","first-page":"4","DOI":"10.1007\/s11263-016-0966-6","volume":"123","author":"A Agrawal","year":"2017","unstructured":"Agrawal, A., et al.: VQA: visual question answering. Int. J. Comput. Vis. 123(1), 4\u201331 (2017). www.visualqa.org","journal-title":"Int. J. Comput. Vis."},{"key":"26_CR2","doi-asserted-by":"crossref","unstructured":"Anderson, P., et al.: Bottom-up and top-down attention for image captioning and visual question answering. In: CVPR (2018)","DOI":"10.1109\/CVPR.2018.00636"},{"key":"26_CR3","unstructured":"Arora, S., Liang, Y., Ma, T.: A simple but tough-to-beat baseline for sentence embeddings. In: ICLR (2017)"},{"key":"26_CR4","doi-asserted-by":"crossref","unstructured":"Azab, M., Wang, M., Smith, M., Kojima, N., Deng, J., Mihalcea, R.: Speaker naming in movies. In: NAACL-HLT, pp. 2206\u20132216 (2018)","DOI":"10.18653\/v1\/N18-1200"},{"key":"26_CR5","unstructured":"Bello, I., Zoph, B., Vasudevan, V., Le, Q.V.: Neural optimizer search with reinforcement learning. In: ICML, pp. 459\u2013468 (2017)"},{"key":"26_CR6","doi-asserted-by":"crossref","unstructured":"Fukui, A., Park, D.H., Yang, D., Rohrbach, A., Darrell, T., Rohrbach, M.: Multimodal compact bilinear pooling for visual question answering and visual grounding. In: EMNLP, pp. 457\u2013468 (2016)","DOI":"10.18653\/v1\/D16-1044"},{"key":"26_CR7","unstructured":"Glorot, X., Bengio, Y.: Understanding the difficulty of training deep feedforward neural networks. In: AISTATS, pp. 249\u2013256 (2010)"},{"key":"26_CR8","doi-asserted-by":"crossref","unstructured":"Goyal, Y., Khot, T., Summers-Stay, D., Batra, D., Parikh, D.: Making the V in VQA matter: elevating the role of image understanding in Visual Question Answering. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.670"},{"key":"26_CR9","doi-asserted-by":"crossref","unstructured":"He, K., Zhang, X., Ren, S., Sun, J.: Deep residual learning for image recognition. arXiv preprint arXiv:1512.03385 (2015)","DOI":"10.1109\/CVPR.2016.90"},{"issue":"8","key":"26_CR10","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter, S., Schmidhuber, J.: Long short-term memory. Neural Comput. 9(8), 1735\u20131780 (1997)","journal-title":"Neural Comput."},{"key":"26_CR11","unstructured":"Hu, H., Chao, W.L., Sha, F.: Learning answer embeddings for visual question answering. In: CVPR (2018)"},{"key":"26_CR12","doi-asserted-by":"crossref","unstructured":"Huang, J., et al.: Speed\/accuracy trade-offs for modern convolutional object detectors. In: CVPR, pp. 3296\u20133297 (2017)","DOI":"10.1109\/CVPR.2017.351"},{"key":"26_CR13","doi-asserted-by":"crossref","unstructured":"Jang, Y., Song, Y., Yu, Y., Kim, Y., Kim, G.: TGIF-QA: toward spatio-temporal reasoning in visual question answering. In: CVPR (2017)","DOI":"10.1109\/CVPR.2017.149"},{"key":"26_CR14","doi-asserted-by":"crossref","unstructured":"Johnson, J., et al.: Inferring and executing programs for visual reasoning. In: ICCV, pp. 3008\u20133017 (2017)","DOI":"10.1109\/ICCV.2017.325"},{"key":"26_CR15","unstructured":"Kazemi, V., Elqursh, A.: Show, ask, attend, and answer: a strong baseline for visual question answering. CoRR abs\/1704.03162 (2017)"},{"key":"26_CR16","doi-asserted-by":"crossref","unstructured":"Kim, K., Heo, M., Choi, S., Zhang, B.: DeepStory: video story QA by deep embedded memory networks. In: IJCAI, pp. 2016\u20132022 (2017)","DOI":"10.24963\/ijcai.2017\/280"},{"key":"26_CR17","series-title":"Lecture Notes in Computer Science","doi-asserted-by":"publisher","first-page":"740","DOI":"10.1007\/978-3-319-10602-1_48","volume-title":"Computer Vision \u2013 ECCV 2014","author":"T-Y Lin","year":"2014","unstructured":"Lin, T.-Y., et al.: Microsoft COCO: common objects in context. In: Fleet, D., Pajdla, T., Schiele, B., Tuytelaars, T. (eds.) ECCV 2014. LNCS, vol. 8693, pp. 740\u2013755. Springer, Cham (2014). https:\/\/doi.org\/10.1007\/978-3-319-10602-1_48"},{"key":"26_CR18","doi-asserted-by":"crossref","unstructured":"Liu, F., Perez, J.: Gated end-to-end memory networks. In: EACL (2017)","DOI":"10.18653\/v1\/E17-1001"},{"key":"26_CR19","unstructured":"Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. In: NIPS, pp. 289\u2013297 (2016)"},{"key":"26_CR20","unstructured":"Mikolov, T., Chen, K., Corrado, G., Dean, J.: Efficient estimation of word representations in vector space. CoRR (2013)"},{"key":"26_CR21","doi-asserted-by":"crossref","unstructured":"Miller, A.H., Fisch, A., Dodge, J., Karimi, A., Bordes, A., Weston, J.: Key-value memory networks for directly reading documents. In: EMNLP, pp. 1400\u20131409 (2016)","DOI":"10.18653\/v1\/D16-1147"},{"key":"26_CR22","doi-asserted-by":"crossref","unstructured":"Mun, J., Seo, P.H., Jung, I., Han, B.: MarioQA: answering questions by watching gameplay videos. In: ICCV, pp. 2886\u20132894 (2017)","DOI":"10.1109\/ICCV.2017.312"},{"key":"26_CR23","doi-asserted-by":"crossref","unstructured":"Na, S., Lee, S., Kim, J., Kim, G.: A read-write memory network for movie story understanding. In: ICCV (2017)","DOI":"10.1109\/ICCV.2017.80"},{"key":"26_CR24","doi-asserted-by":"crossref","unstructured":"Pennington, J., Socher, R., Manning, C.D.: Glove: global vectors for word representation. In: EMNLP, pp. 1532\u20131543 (2014)","DOI":"10.3115\/v1\/D14-1162"},{"key":"26_CR25","doi-asserted-by":"crossref","unstructured":"Rajpurkar, P., Zhang, J., Lopyrev, K., Liang, P.: Squad: 100, 000+ questions for machine comprehension of text. In: EMNLP, pp. 2383\u20132392 (2016)","DOI":"10.18653\/v1\/D16-1264"},{"issue":"6","key":"26_CR26","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren, S., He, K., Girshick, R.B., Sun, J.: Faster R-CNN: towards real-time object detection with region proposal networks. IEEE Trans. Pattern Anal. Mach. Intell. 39(6), 1137\u20131149 (2017)","journal-title":"IEEE Trans. Pattern Anal. Mach. Intell."},{"key":"26_CR27","doi-asserted-by":"crossref","unstructured":"Rohrbach, A., et al.: Movie description. Int. J. Comput. Vis. (2017)","DOI":"10.1007\/s11263-016-0987-1"},{"key":"26_CR28","unstructured":"Simonyan, K., Zisserman, A.: Very deep convolutional networks for large-scale image recognition. In: ICLR (2015)"},{"key":"26_CR29","unstructured":"Sukhbaatar, S., Szlam, A., Weston, J., Fergus, R.: End-to-end memory networks. In: NIPS (2015)"},{"key":"26_CR30","doi-asserted-by":"crossref","unstructured":"Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., Fidler, S.: MovieQA: understanding stories in movies through question-answering. In: CVPR (2016)","DOI":"10.1109\/CVPR.2016.501"},{"key":"26_CR31","doi-asserted-by":"crossref","unstructured":"Wang, B., Xu, Y., Han, Y., Hong, R.: Movie question answering: remembering the textual cues for layered visual contents. In: AAAI (2018)","DOI":"10.1609\/aaai.v32i1.12253"},{"key":"26_CR32","unstructured":"Weston, J., Bordes, A., Chopra, S., Mikolov, T.: Towards AI-complete question answering: a set of prerequisite toy tasks. CoRR abs\/1502.05698 (2015)"},{"key":"26_CR33","unstructured":"Weston, J., Chopra, S., Bordes, A.: Memory networks. CoRR abs\/1410.3916 (2014)"},{"key":"26_CR34","doi-asserted-by":"publisher","first-page":"21","DOI":"10.1016\/j.cviu.2017.05.001","volume":"163","author":"Q Wu","year":"2017","unstructured":"Wu, Q., Teney, D., Wang, P., Shen, C., Dick, A.R., van den Hengel, A.: Visual question answering: a survey of methods and datasets. Comput. Vis. Image Underst. 163, 21\u201340 (2017)","journal-title":"Comput. Vis. Image Underst."}],"container-title":["Lecture Notes in Computer Science","Computer Vision \u2013 ACCV 2018"],"original-title":[],"language":"en","link":[{"URL":"http:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-030-20876-9_26","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2022,9,18]],"date-time":"2022-09-18T17:05:47Z","timestamp":1663520747000},"score":1,"resource":{"primary":{"URL":"http:\/\/link.springer.com\/10.1007\/978-3-030-20876-9_26"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2019]]},"ISBN":["9783030208752","9783030208769"],"references-count":34,"URL":"https:\/\/doi.org\/10.1007\/978-3-030-20876-9_26","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2019]]},"assertion":[{"value":"26 May 2019","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"ACCV","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Asian Conference on Computer Vision","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Perth, WA","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Australia","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2018","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 December 2018","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"6 December 2018","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"14","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"accv2018","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/accv2018.net\/","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"Microsoft CMT","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"979","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"274","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"28% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"3","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"2.7","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}},{"value":"No","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information"}}]}}