{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,3,25]],"date-time":"2025-03-25T17:05:33Z","timestamp":1742922333517,"version":"3.40.3"},"publisher-location":"Singapore","reference-count":29,"publisher":"Springer Nature Singapore","isbn-type":[{"type":"print","value":"9789819794362"},{"type":"electronic","value":"9789819794379"}],"license":[{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2024,11,1]],"date-time":"2024-11-01T00:00:00Z","timestamp":1730419200000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":[],"published-print":{"date-parts":[[2025]]},"DOI":"10.1007\/978-981-97-9437-9_33","type":"book-chapter","created":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T16:27:55Z","timestamp":1730392075000},"page":"420-432","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":0,"title":["DFS-QA: Dynamic Frame Selection for\u00a0Better Video Question Answering"],"prefix":"10.1007","author":[{"given":"Zhibo","family":"Ren","sequence":"first","affiliation":[]},{"given":"Baoyu","family":"Hou","sequence":"additional","affiliation":[]},{"given":"Huizhen","family":"Wang","sequence":"additional","affiliation":[]},{"given":"Muhua","family":"Zhu","sequence":"additional","affiliation":[]},{"given":"Tong","family":"Xiao","sequence":"additional","affiliation":[]},{"given":"Jingbo","family":"Zhu","sequence":"additional","affiliation":[]}],"member":"297","published-online":{"date-parts":[[2024,11,1]]},"reference":[{"key":"33_CR1","doi-asserted-by":"crossref","unstructured":"Yu, Z., \u00a0Yu, J., \u00a0Cui, Y., \u00a0Tao, D., \u00a0Tian, Q.: Deep modular co-attention networks for visual question answering. In: Proceedings of CVPR, pp. 6281\u20136290 (2019)","DOI":"10.1109\/CVPR.2019.00644"},{"key":"33_CR2","unstructured":"Vaswani, A., et al.: Attention is all you need. In: Proceedings of NIPS, pp. 5998\u20136008 (2017)"},{"key":"33_CR3","unstructured":"Devlin, J., Chang, M.-W., \u00a0Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: Proceedings of NAACL, pp. 4171\u20134186 (2019)"},{"key":"33_CR4","unstructured":"Ilse, M., Tomczak, J.M., \u00a0Welling, M.: Attention-based deep multiple instance learning. In: Proceedings of ICML, 2018, pp. 2127\u20132136 (2018)"},{"key":"33_CR5","doi-asserted-by":"crossref","unstructured":"Xie, S., \u00a0Sun, C., \u00a0Huang, J., \u00a0Tu, Z., \u00a0Murphy, K.: Rethinking spatiotemporal feature learning: Speed-accuracy trade-offs in video classification. In: Proceedings of CVPR, pp. 305\u2013321 (2018)","DOI":"10.1007\/978-3-030-01267-0_19"},{"key":"33_CR6","doi-asserted-by":"crossref","unstructured":"Le, T.M., \u00a0Le, V., \u00a0Venkatesh, S., \u00a0Tran, T.: Hierarchical conditional relation networks for video question answering. In: Proceedings of CVPR, pp. 9972\u20139981 (2020)","DOI":"10.1109\/CVPR42600.2020.00999"},{"key":"33_CR7","unstructured":"Jiang, P., \u00a0Han, Y.: Reasoning with heterogeneous graph alignment for video question answering. In: Proceedings of AAAI, pp. 11\u00a0109\u201311\u00a0116 (2020)"},{"key":"33_CR8","doi-asserted-by":"crossref","unstructured":"Gao, J., \u00a0Ge, R., \u00a0Chen, K., \u00a0Nevatia, R.: Motion-appearance co-memory networks for video question answering. In: Proceedings of CVPR, pp. 6576\u20136585 (2018)","DOI":"10.1109\/CVPR.2018.00688"},{"key":"33_CR9","doi-asserted-by":"crossref","unstructured":"Fan, C., Zhang, X.,\u00a0Zhang, S., \u00a0Wang, W., \u00a0Zhang, C., Huang, H.: Heterogeneous memory enhanced multimodal attention model for video question answering. In: Proceedings of CVPR, pp. 1999\u20132007 (2019)","DOI":"10.1109\/CVPR.2019.00210"},{"key":"33_CR10","doi-asserted-by":"crossref","unstructured":"Buch, S., \u00a0Eyzaguirre, C., \u00a0Gaidon, A., \u00a0Wu, J., \u00a0Fei-Fei, L., Niebles, J.C.: Revisiting the \u201cvideo\u201d in video-language understanding. In: Proceedings of CVPR, pp. 2907\u20132917 (2022)","DOI":"10.1109\/CVPR52688.2022.00293"},{"key":"33_CR11","doi-asserted-by":"crossref","unstructured":"Xiao, J., \u00a0Zhou, P., Chua, T.-S., \u00a0Yan, S.: Video graph transformer for video question answering. In: Proceedings of ECCV, 2022, pp. 39\u201358 (2022)","DOI":"10.1007\/978-3-031-20059-5_3"},{"key":"33_CR12","doi-asserted-by":"crossref","unstructured":"Jang, Y., \u00a0Song, Y., \u00a0Yu, Y., \u00a0Kim, Y., \u00a0Kim, G.: TGIF-QA: toward spatio-temporal reasoning in visual question answering. In: Proceedings of CVPR, pp. 2758\u20132766 (2017)","DOI":"10.1109\/CVPR.2017.149"},{"key":"33_CR13","doi-asserted-by":"crossref","unstructured":"Xiao, J., \u00a0Shang, X., \u00a0Yao, A., Chua, T.-S.: NExT-QA: next phase of question-answering to explaining temporal actions. In: Proceedings of CVPR, pp. 9777\u20139786 (2021)","DOI":"10.1109\/CVPR46437.2021.00965"},{"key":"33_CR14","doi-asserted-by":"crossref","unstructured":"Xu, D., et al.: Video question answering via gradually refined attention over appearance and motion. In: Proceedings of ACM Multimedia, pp. 1645\u20131653 (2017)","DOI":"10.1145\/3123266.3123427"},{"key":"33_CR15","unstructured":"Chen, D.L., Dolan, W.B.: Collecting highly parallel data for paraphrase evaluation. In: Proceedings of ACL, pp. 190\u2013200 (2011)"},{"key":"33_CR16","doi-asserted-by":"crossref","unstructured":"Xiao, J., \u00a0Yao, A., \u00a0Liu, Z., \u00a0Li, Y., \u00a0Ji, W., \u00a0Chua, T.: Video as conditional graph hierarchy for multi-granular question answering. In: Proceedings of AAAI, pp. 2804\u20132812 (2022)","DOI":"10.1609\/aaai.v36i3.20184"},{"key":"33_CR17","doi-asserted-by":"crossref","unstructured":"Abdessaied, A., \u00a0Sood, E., Bulling, A.: Video language co-attention with multimodal fast-learning feature fusion for videoqa. In: Proceedings of ACL, pp. 143\u2013155 (2022)","DOI":"10.18653\/v1\/2022.repl4nlp-1.15"},{"key":"33_CR18","doi-asserted-by":"crossref","unstructured":"Li, Y., \u00a0Wang, X., \u00a0Xiao, J., \u00a0Ji, W., \u00a0Chua, T.: Invariant grounding for video question answering. In: Proceedings of CVPR, pp. 2918\u20132927 (2022)","DOI":"10.1109\/CVPR52688.2022.00294"},{"key":"33_CR19","doi-asserted-by":"crossref","unstructured":"Dang, L.H., Le, T.M., \u00a0Le, V., Tran, T.: Hierarchical object-oriented spatio-temporal reasoning for video question answering. In: Proceedings of IJCAI, pp. 636\u2013642 (2021)","DOI":"10.24963\/ijcai.2021\/88"},{"key":"33_CR20","doi-asserted-by":"crossref","unstructured":"Peng, L., Yang, S., \u00a0Bin, Y., Wang, G.: Progressive graph attention network for video question answering. In: Proceedings of ACM Multimedia, pp. 2871\u20132879 (2021)","DOI":"10.1145\/3474085.3475193"},{"key":"33_CR21","unstructured":"Huang, D., Chen, P., \u00a0Zeng, R., \u00a0Du, Q., \u00a0Tan, M., Gan, C.: Location-aware graph convolutional networks for video question answering. In: Proceedings of AAAI, pp. 11\u00a0021\u201311\u00a0028 (2020)"},{"key":"33_CR22","doi-asserted-by":"crossref","unstructured":"Park, J., \u00a0Lee, J., Sohn, K.: Bridge to answer: structure-aware graph interaction network for video question answering. In: Proceedings of CVPR, pp. 15\u00a0526\u201315\u00a0535 (2021)","DOI":"10.1109\/CVPR46437.2021.01527"},{"key":"33_CR23","unstructured":"Dosovitskiy, A., et al.: An image is worth 16x16 words: Transformers for image recognition at scale. In: Proceedings of ICLR (2021)"},{"key":"33_CR24","doi-asserted-by":"crossref","unstructured":"Wu, J., \u00a0Yu, Y., \u00a0Huang, C., \u00a0Yu, K.: Deep multiple instance learning for image classification and auto-annotation. In: Proceedings of CVPR, pp. 3460\u20133469 (2015)","DOI":"10.1109\/CVPR.2015.7298968"},{"key":"33_CR25","doi-asserted-by":"crossref","unstructured":"Xu, J., \u00a0Mei, T., \u00a0Yao, T., \u00a0Rui, Y.: MSR-VTT: a large video description dataset for bridging video and language. In: Proceedings of CVPR, pp. 5288\u20135296 (2016)","DOI":"10.1109\/CVPR.2016.571"},{"key":"33_CR26","doi-asserted-by":"crossref","unstructured":"Seo, A., \u00a0Kang, G., \u00a0Park, J., \u00a0Zhang, B.: Attend what you need: Motion-appearance synergistic networks for video question answering. In: Proceedings of ACL, pp. 6167\u20136177 (2021)","DOI":"10.18653\/v1\/2021.acl-long.481"},{"key":"33_CR27","unstructured":"Jang, E., \u00a0Gu, S., \u00a0Poole, B.: Categorical reparameterization with gumbel-softmax. In: Proceedings of ICLR (2017)"},{"key":"33_CR28","unstructured":"Kingma, D., \u00a0Ba, J.: Adam: a method for stochastic optimization. Comput. Sci. (2014)"},{"key":"33_CR29","doi-asserted-by":"crossref","unstructured":"Meng, Y., et al.: Ar-net: adaptive frame resolution for efficient action recognition. In: European Conference on Computer Vision, pp. 86\u2013104.\u00a0Springer (2020)","DOI":"10.1007\/978-3-030-58571-6_6"}],"container-title":["Lecture Notes in Computer Science","Natural Language Processing and Chinese Computing"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/978-981-97-9437-9_33","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,10,31]],"date-time":"2024-10-31T16:33:47Z","timestamp":1730392427000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/978-981-97-9437-9_33"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2024,11,1]]},"ISBN":["9789819794362","9789819794379"],"references-count":29,"URL":"https:\/\/doi.org\/10.1007\/978-981-97-9437-9_33","relation":{},"ISSN":["0302-9743","1611-3349"],"issn-type":[{"type":"print","value":"0302-9743"},{"type":"electronic","value":"1611-3349"}],"subject":[],"published":{"date-parts":[[2024,11,1]]},"assertion":[{"value":"1 November 2024","order":1,"name":"first_online","label":"First Online","group":{"name":"ChapterHistory","label":"Chapter History"}},{"value":"NLPCC","order":1,"name":"conference_acronym","label":"Conference Acronym","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"CCF International Conference on Natural Language Processing and Chinese Computing","order":2,"name":"conference_name","label":"Conference Name","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"Hangzhou","order":3,"name":"conference_city","label":"Conference City","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"China","order":4,"name":"conference_country","label":"Conference Country","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2024","order":5,"name":"conference_year","label":"Conference Year","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"2 November 2024","order":7,"name":"conference_start_date","label":"Conference Start Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"4 November 2024","order":8,"name":"conference_end_date","label":"Conference End Date","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"13","order":9,"name":"conference_number","label":"Conference Number","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"nlpcc2024","order":10,"name":"conference_id","label":"Conference ID","group":{"name":"ConferenceInfo","label":"Conference Information"}},{"value":"http:\/\/tcci.ccf.org.cn\/conference\/2024\/index.php","order":11,"name":"conference_url","label":"Conference URL","group":{"name":"ConferenceInfo","label":"Conference Information"}}]}}