{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T17:20:30Z","timestamp":1742923230380,"version":"3.40.3"},"publisher-location":"Cham","reference-count":37,"publisher":"Springer Nature Switzerland","isbn-type":[{"type":"print","value":"9783031533105"},{"type":"electronic","value":"9783031533112"}],"license":[{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,1,1]],"date-time":"2024-01-01T00:00:00Z","timestamp":1704067200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2024]]},"DOI":"10.1007\/978-3-031-53311-2_16","type":"book-chapter","created":{"date-parts":[[2024,1,27]],"date-time":"2024-01-27T21:37:36Z","timestamp":1706391456000},"page":"212-227","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["Appearance-Motion Dual-Stream Heterogeneous Network for\u00a0VideoQA"],"prefix":"10.1007","author":[{"given":"Feifei","family":"Xu","sequence":"first","affiliation":[]},{"given":"Zheng","family":"Zhong","sequence":"additional","affiliation":[]},{"given":"Yitao","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Yingchen","family":"Zhou","sequence":"additional","affiliation":[]},{"given":"Guangzhen","family":"Li","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,1,28]]},"reference":[{"key":"16_CR1","doi-asserted-by":"crossref","unstructured":"Yang, Z., et al.: Stacked attention networks for image question answering. In: Proceedings of CVPR (2016)","DOI":"10.1109\/CVPR.2016.10"},{"key":"16_CR2","doi-asserted-by":"crossref","unstructured":"Fan, C., et al.: Heterogeneous memory enhanced multimodal attention model for video question answering. In: Proceedings of CVPR (2019)","DOI":"10.1109\/CVPR.2019.00210"},{"key":"16_CR3","doi-asserted-by":"crossref","unstructured":"Xiao, J., et al. :NExT-QA: next phase of question-answering to explaining temporal actions. In: Proceedings of CVPR (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"16_CR4","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: Cross-Attentional Spatio-Temporal semantic graph networks for video question answering. In: Proceedings of Image Processing (2022)","DOI":"10.1109\/TIP.2022.3142526"},{"key":"16_CR5","doi-asserted-by":"crossref","unstructured":"He, K., et al.: Deep residual learning for image recognition. In: Proceedings of CVPR (2016)","DOI":"10.1109\/CVPR.2016.90"},{"key":"16_CR6","doi-asserted-by":"crossref","unstructured":"Hara, K., et al.: Can spatiotemporal 3D CNNs retrace the history of 2D CNNs and imageNet? In: Proceedings of CVPR (2017)","DOI":"10.1109\/CVPR.2018.00685"},{"key":"16_CR7","unstructured":"Kim, J-H., et al.: Bilinear Attention Networks. In: Proceedings of NeurIPS (2018)"},{"key":"16_CR8","doi-asserted-by":"crossref","unstructured":"Xu, D., et al.: Video question answering via gradually refined attention over appearance and motion. In: Proceedings of ACM (2017)","DOI":"10.1145\/3123266.3123427"},{"key":"16_CR9","doi-asserted-by":"crossref","unstructured":"Zhao, Z., et al.: Video question answering via hierarchical Dual-Level attention network learning. In: Proceedings of ACM (2017)","DOI":"10.1145\/3123266.3123364"},{"key":"16_CR10","unstructured":"Simonyan, K., Zisserman, A.:Very deep convolutional networks for Large-Scale image recognition. CoRR abs\/1409.1556 (2014)"},{"key":"16_CR11","doi-asserted-by":"crossref","unstructured":"Tran, D., et al.: Learning spatiotemporal features with 3D convolutional networks. In: Proceedings of ICCV (2015)","DOI":"10.1109\/ICCV.2015.510"},{"key":"16_CR12","doi-asserted-by":"crossref","unstructured":"Jang, Y., et al.: TGIF-QA: Toward Spatio-Temporal reasoning in visual question answering. In: Proceedings of CVPR (2017)","DOI":"10.1109\/CVPR.2017.149"},{"key":"16_CR13","doi-asserted-by":"crossref","unstructured":"Jiang, J., et al.: Divide and conquer: Question-Guided Spatio-Temporal contextual attention for video question answering. In: Proceedings of AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6766"},{"key":"16_CR14","doi-asserted-by":"crossref","unstructured":"Gao, J. et al.: Motion-Appearance Co-memory networks for video question answering. In: Proceedings of CVPR (2018)","DOI":"10.1109\/CVPR.2018.00688"},{"key":"16_CR15","doi-asserted-by":"crossref","unstructured":"Huang, D., et al.: Location-Aware graph convolutional networks for video question answering. In: Proceedings of AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6737"},{"key":"16_CR16","doi-asserted-by":"crossref","unstructured":"Zeng, K-H et al.: Leveraging video descriptions to learn video question answering. In: Proceedings of AAAI (2016)","DOI":"10.1609\/aaai.v31i1.11238"},{"key":"16_CR17","doi-asserted-by":"crossref","unstructured":"Zhu, L., et al.: Uncovering the temporal context for video question answering. In: Proceedings of IJCV (2017)","DOI":"10.1007\/s11263-017-1033-7"},{"key":"16_CR18","doi-asserted-by":"crossref","unstructured":"Zhao, Z., et al.: Multi-Turn video question answering via Multi-Stream hierarchical attention context network. In: Proceedings of IJCAI (2018)","DOI":"10.24963\/ijcai.2018\/513"},{"key":"16_CR19","unstructured":"Ren, S., et al.: Faster R-CNN: towards Real-Time object detection with region proposal networks. In: Proceedings of TPAMI (2015)"},{"key":"16_CR20","unstructured":"Devlin, J., et al.: BERT: Pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL-HLT (2019)"},{"key":"16_CR21","doi-asserted-by":"crossref","unstructured":"He, K., et al.: Mask R-CNN. In: Proceedings of TPMAI (2017)","DOI":"10.1109\/ICCV.2017.322"},{"key":"16_CR22","doi-asserted-by":"crossref","unstructured":"Seo, A., et al.: Attend what you need: Motion-Appearance synergistic networks for video question answering. In: Proceedings of ACL (2021)","DOI":"10.18653\/v1\/2021.acl-long.481"},{"issue":"1","key":"16_CR23","doi-asserted-by":"publisher","first-page":"32","DOI":"10.1007\/s11263-016-0981-7","volume":"123","author":"R krishna","year":"2017","unstructured":"krishna, R., et al.: Visual Genome: connecting language and vision using crowdsourced dense image annotations. Int. J. Comput. Vis. 123(1), 32\u201373 (2017). https:\/\/doi.org\/10.1007\/s11263-016-0981-7","journal-title":"Int. J. Comput. Vis."},{"key":"16_CR24","doi-asserted-by":"crossref","unstructured":"Deng, J., et al. ImageNet: a large-scale hierarchical image database. In: Proceedings of CVPR (2009)","DOI":"10.1109\/CVPR.2009.5206848"},{"key":"16_CR25","unstructured":"Kay, W., et al.: The kinetics human action video dataset. ArXiv abs\/1705.06950 (2017)"},{"key":"16_CR26","unstructured":"Kingma, D., et al.: Adam: a method for stochastic optimization. CoRR abs\/1412.6980 (2014)"},{"key":"16_CR27","doi-asserted-by":"crossref","unstructured":"Ye, Y., et al.: Video question answering via Attribute-Augmented attention network learning. In: Proceedings of SIGIR (2017)","DOI":"10.1145\/3077136.3080655"},{"key":"16_CR28","unstructured":"Chen, D.L., William B.D.: Collecting Highly Parallel Data for Paraphrase Evaluation. In: Proceedings of ACL (2011)"},{"key":"16_CR29","doi-asserted-by":"crossref","unstructured":"Xu, J., et al.; MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of CVPR (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"16_CR30","doi-asserted-by":"crossref","unstructured":"Zha, Z., et al.: Spatiotemporal-Textual Co-Attention network for video question answering. In: Proceedings of TOMM (2019)","DOI":"10.1145\/3320061"},{"key":"16_CR31","doi-asserted-by":"crossref","unstructured":"Yu, T., et al.: Compositional attention networks with Two-Stream fusion for video question answering. In: Proceedings of TIP (2020)","DOI":"10.1109\/TIP.2019.2940677"},{"key":"16_CR32","doi-asserted-by":"crossref","unstructured":"Jiang, P., Han, Y.: Reasoning with heterogeneous graph alignment for video question Answering. In: Proceedings of AAAI (2020)","DOI":"10.1609\/aaai.v34i07.6767"},{"key":"16_CR33","doi-asserted-by":"crossref","unstructured":"Cai, J., et al.: Feature augmented memory with global attention network for VideoQA. In: Proceedings of IJCAI (2020)","DOI":"10.24963\/ijcai.2020\/139"},{"key":"16_CR34","doi-asserted-by":"crossref","unstructured":"Gu, M., et al.: Graph-Based Multi-Interaction network for video question answering. In: Proceedings of TIP(2021)","DOI":"10.1109\/TIP.2021.3051756"},{"key":"16_CR35","doi-asserted-by":"crossref","unstructured":"Abdessaied, A., et al.: Video language Co-Attention with multimodal Fast-Learning feature fusion for VideoQA. In: Proceedings of RepL4NLP (2022)","DOI":"10.18653\/v1\/2022.repl4nlp-1.15"},{"key":"16_CR36","doi-asserted-by":"crossref","unstructured":"Liu, Y., et al.: Dynamic self-attention with vision synchronization networks for video question answering. In: Proceedings of ICPR (2022)","DOI":"10.1016\/j.patcog.2022.108959"},{"key":"16_CR37","doi-asserted-by":"crossref","unstructured":"Li, X., et al.: Complementary spatiotemporal network for video question answering. In: Proceedings of Multimed Syst (2021)","DOI":"10.1007\/s00530-021-00805-6"}],"container-title":["Lecture Notes in Computer Science","MultiMedia Modeling"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-3-031-53311-2_16","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,3,12]],"date-time":"2024-03-12T15:26:25Z","timestamp":1710257185000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-3-031-53311-2_16"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024]]},"ISBN":["9783031533105","9783031533112"],"references-count":37,"URL":"https:\/\/doi.org\/10.1007\/978-3-031-53311-2_16","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024]]},"assertion":[{"value":"28 January 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"MMM","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"International Conference on Multimedia Modeling","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Amsterdam","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"The Netherlands","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"29 January 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 February 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"30","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"mmm2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Double-blind","order":1,"name":"type","label":"Type","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"ConfTool Pro","order":2,"name":"conference_management_system","label":"Conference Management System","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"297","order":3,"name":"number_of_submissions_sent_for_review","label":"Number of Submissions Sent for Review","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"112","order":4,"name":"number_of_full_papers_accepted","label":"Number of Full Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"0","order":5,"name":"number_of_short_papers_accepted","label":"Number of Short Papers Accepted","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"38% - The value is computed by the equation \"Number of Full Papers Accepted \/ Number of Submissions Sent for Review * 100\" and then rounded to a whole number.","order":6,"name":"acceptance_rate_of_full_papers","label":"Acceptance Rate of Full Papers","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":7,"name":"average_number_of_reviews_per_paper","label":"Average Number of Reviews per Paper","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"3.2","order":8,"name":"average_number_of_papers_per_reviewer","label":"Average Number of Papers per Reviewer","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}},{"value":"Yes","order":9,"name":"external_reviewers_involved","label":"External Reviewers Involved","group":{"name":"ConfEventPeerReviewInformation","label":"Peer Review Information (provided by the conference organizers)"}}]}}