{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2025,6,18]],"date-time":"2025-06-18T04:18:58Z","timestamp":1750220338121,"version":"3.41.0"},"publisher-location":"New York, NY, USA","reference-count":29,"publisher":"ACM","license":[{"start":{"date-parts":[[2021,12,4]],"date-time":"2021-12-04T00:00:00Z","timestamp":1638576000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.acm.org\/publications\/policies\/copyright_policy#Background"}],"content-domain":{"domain":["dl.acm.org"],"crossmark-restriction":true},"short-container-title":[],"published-print":{"date-parts":[[2021,12,4]]},"DOI":"10.1145\/3507548.3507567","type":"proceedings-article","created":{"date-parts":[[2022,3,9]],"date-time":"2022-03-09T23:12:40Z","timestamp":1646867560000},"page":"128-133","update-policy":"https:\/\/doi.org\/10.1145\/crossmark-policy","source":"Crossref","is-referenced-by-count":0,"title":["Keyword-aware Multi-modal Enhancement Attention for Video Question Answering"],"prefix":"10.1145","author":[{"given":"Duo","family":"Chen","sequence":"first","affiliation":[{"name":"Sun Yat-sen University,School of Computer Science and Engineering,National Engineering Research Center of Digital Life, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Fuwei","family":"Zhang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University,School of Computer Science and Engineering,National Engineering Research Center of Digital Life, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Shirou","family":"Ou","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University,School of Computer Science and Engineering,National Engineering Research Center of Digital Life, China"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Ruomei","family":"Wang","sequence":"additional","affiliation":[{"name":"Sun Yat-sen University,School of Computer Science and Engineering,National Engineering Research Center of Digital Life, China"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"320","published-online":{"date-parts":[[2022,3,9]]},"reference":[{"key":"e_1_3_2_1_1_1","first-page":"4733","volume-title":"Zisserman","author":"Carreira J.","unstructured":"Carreira , J. , Zisserman , A. : Quo vadis, action recognition? A new model and the kinetics dataset. In : CVPR. pp. 4724\u2013 4733 . IEEE Computer Society (2017). Carreira, J., Zisserman, A.: Quo vadis, action recognition? A new model and the kinetics dataset. In: CVPR. pp. 4724\u20134733. IEEE Computer Society (2017)."},{"key":"e_1_3_2_1_2_1","first-page":"4358","volume-title":"Mihalcea","author":"Castro S.","unstructured":"Castro , S. , Azab , M. , Stroud , J.C. , Noujaim , C. , Wang , R. , Deng , J. , Mihalcea , R. : Lifeqa: A real-life dataset for video question answering. In : LREC. pp. 4352\u2013 4358 . European Language Resources Association (2020). Castro, S., Azab, M., Stroud, J.C., Noujaim, C., Wang, R., Deng, J., Mihalcea, R.: Lifeqa: A real-life dataset for video question answering. In: LREC. pp. 4352\u20134358. European Language Resources Association (2020)."},{"key":"e_1_3_2_1_3_1","first-page":"4186","volume-title":"Toutanova","author":"Devlin J.","unstructured":"Devlin , J. , Chang , M. , Lee , K. , Toutanova , K. : BERT : pre-training of deep bidirectional transformers for language understanding. In : NAACL-HLT ( 1). pp. 4171\u2013 4186 . Association for Computational Linguistics (2019). Devlin, J., Chang, M., Lee, K., Toutanova, K.: BERT: pre-training of deep bidirectional transformers for language understanding. In: NAACL-HLT (1). pp. 4171\u2013 4186. Association for Computational Linguistics (2019)."},{"key":"e_1_3_2_1_4_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-58523-5_34"},{"key":"e_1_3_2_1_5_1","volume-title":"Event detection in soccer video based on audio\/visual keywords. masters","author":"Kang Y.","year":"2004","unstructured":"Kang , Y. : Event detection in soccer video based on audio\/visual keywords. masters ( 2004 ). Kang, Y.: Event detection in soccer video based on audio\/visual keywords. masters (2004)."},{"key":"e_1_3_2_1_6_1","first-page":"10112","volume-title":"Yoo","author":"Kim J.","unstructured":"Kim , J. , Ma , M. , Pham , T.X. , Kim , K. , Yoo , C.D. : Modality shifting attention network for multi-modal video question answering. In : CVPR. pp. 10103\u2013 10112 . IEEE (2020). Kim, J., Ma, M., Pham, T.X., Kim, K., Yoo, C.D.: Modality shifting attention network for multi-modal video question answering. In: CVPR. pp. 10103\u201310112. IEEE (2020)."},{"key":"e_1_3_2_1_7_1","first-page":"2022","volume-title":"Zhang","author":"Kim K.","unstructured":"Kim , K. , Heo , M. , Choi , S. , Zhang , B. : Deepstory: Video story QA by deep embedded memory networks. In : IJCAI. pp. 2016\u2013 2022 . ijcai.org (2017). Kim, K., Heo, M., Choi, S., Zhang, B.: Deepstory: Video story QA by deep embedded memory networks. In: IJCAI. pp. 2016\u20132022. ijcai.org (2017)."},{"key":"e_1_3_2_1_8_1","volume-title":"Ba","author":"Kingma D.P.","year":"2015","unstructured":"Kingma , D.P. , Ba , J. : Adam: A method for stochastic optimization. In : ICLR (Poster) ( 2015 ). Kingma, D.P., Ba, J.: Adam: A method for stochastic optimization. In: ICLR (Poster) (2015)."},{"key":"e_1_3_2_1_9_1","doi-asserted-by":"publisher","DOI":"10.1007\/s11263-016-0981-7"},{"key":"e_1_3_2_1_10_1","first-page":"8225","volume-title":"Bansal, M.: TVQA+: spatio-temporal grounding for video question answering. In: ACL.","author":"Lei J.","unstructured":"Lei , J. , Yu , L. , Berg , T.L. , Bansal, M.: TVQA+: spatio-temporal grounding for video question answering. In: ACL. pp. 8211\u2013 8225 . Association for Computational Linguistics (2020). Lei, J., Yu, L., Berg, T.L., Bansal, M.: TVQA+: spatio-temporal grounding for video question answering. In: ACL. pp. 8211\u20138225. Association for Computational Linguistics (2020)."},{"key":"e_1_3_2_1_11_1","volume-title":"of","author":"Juanzi FAN","year":"2007","unstructured":"LI, Juanzi , FAN , Qi'na , ZHANG , Kuo , Department , of , Computer, Science : Keyword extraction based on tf\/idf for chinese news document. Wuhan University Journal of Natural Sciences ( 2007 ). LI, Juanzi, FAN, Qi'na, ZHANG, Kuo, Department, of, Computer, Science: Keyword extraction based on tf\/idf for chinese news document. Wuhan University Journal of Natural Sciences (2007)."},{"key":"e_1_3_2_1_12_1","first-page":"7","volume-title":"Lovell","author":"Liu L.","unstructured":"Liu , L. , Wiliem , A. , Chen , S. , Zhao , K. , Lovell , B.C. : Determining the best attributes for surveillance video keywords generation. In : ISBA. pp. 1\u2013 7 . IEEE (2016). Liu, L., Wiliem, A., Chen, S., Zhao, K., Lovell, B.C.: Determining the best attributes for surveillance video keywords generation. In: ISBA. pp. 1\u20137. IEEE (2016)."},{"key":"e_1_3_2_1_13_1","first-page":"297","volume-title":"Parikh","author":"Lu J.","year":"2016","unstructured":"Lu , J. , Yang , J. , Batra , D. , Parikh , D. : Hierarchical question-image co-attention for visual question answering. In : NIPS. pp. 289\u2013 297 ( 2016 ). Lu, J., Yang, J., Batra, D., Parikh, D.: Hierarchical question-image co-attention for visual question answering. In: NIPS. pp. 289\u2013297 (2016)."},{"key":"e_1_3_2_1_14_1","doi-asserted-by":"publisher","DOI":"10.18178\/joig.6.2.174-180"},{"key":"e_1_3_2_1_15_1","doi-asserted-by":"publisher","DOI":"10.1109\/ICDH51081.2020.00055"},{"key":"e_1_3_2_1_16_1","volume-title":"Using tf-idf to determine word relevance in document queries","author":"Ramos J.","year":"2003","unstructured":"Ramos , J. : Using tf-idf to determine word relevance in document queries ( 2003 ). Ramos, J.: Using tf-idf to determine word relevance in document queries (2003)."},{"key":"e_1_3_2_1_17_1","volume-title":"Hajishirzi","author":"Seo M.J.","year":"2017","unstructured":"Seo , M.J. , Kembhavi , A. , Farhadi , A. , Hajishirzi , H. : Bidirectional attention flow for machine comprehension. In : ICLR (Poster). OpenReview .net ( 2017 ). Seo, M.J., Kembhavi, A., Farhadi, A., Hajishirzi, H.: Bidirectional attention flow for machine comprehension. In: ICLR (Poster). OpenReview.net (2017)."},{"key":"e_1_3_2_1_18_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.eswa.2018.08.015"},{"key":"e_1_3_2_1_19_1","doi-asserted-by":"publisher","DOI":"10.1007\/978-3-030-01225-0_32"},{"key":"e_1_3_2_1_20_1","doi-asserted-by":"publisher","DOI":"10.18178\/joig.3.2.90-95"},{"key":"e_1_3_2_1_21_1","first-page":"4640","volume-title":"Fidler","author":"Tapaswi M.","unstructured":"Tapaswi , M. , Zhu , Y. , Stiefelhagen , R. , Torralba , A. , Urtasun , R. , Fidler , S. : Movieqa: Understanding stories in movies through question-answering. In : CVPR. pp. 4631\u2013 4640 . IEEE Computer Society (2016). Tapaswi, M., Zhu, Y., Stiefelhagen, R., Torralba, A., Urtasun, R., Fidler, S.: Movieqa: Understanding stories in movies through question-answering. In: CVPR. pp. 4631\u20134640. IEEE Computer Society (2016)."},{"key":"e_1_3_2_1_22_1","first-page":"4497","volume-title":"Paluri","author":"Tran D.","unstructured":"Tran , D. , Bourdev , L.D. , Fergus , R. , Torresani , L. , Paluri , M. : Learning spatiotemporal features with 3d convolutional networks. In : ICCV. pp. 4489\u2013 4497 . IEEE Computer Society (2015). Tran, D., Bourdev, L.D., Fergus, R., Torresani, L., Paluri, M.: Learning spatiotemporal features with 3d convolutional networks. In: ICCV. pp. 4489\u20134497. IEEE Computer Society (2015)."},{"key":"e_1_3_2_1_23_1","first-page":"6008","volume-title":"Polosukhin","author":"Vaswani A.","year":"2017","unstructured":"Vaswani , A. , Shazeer , N. , Parmar , N. , Uszkoreit , J. , Jones , L. , Gomez , A.N. , Kaiser , L. , Polosukhin , I. : Attention is all you need. In : NIPS. pp. 5998\u2013 6008 ( 2017 ). Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A.N., Kaiser, L., Polosukhin, I.: Attention is all you need. In: NIPS. pp. 5998\u20136008 (2017)."},{"key":"e_1_3_2_1_24_1","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107248"},{"key":"e_1_3_2_1_25_1","first-page":"1424","volume-title":"Chang","author":"Welch M.J.","unstructured":"Welch , M.J. , Cho , J. , Chang , W. : Generating advertising keywords from video content. In : CIKM. pp. 1421\u2013 1424 . ACM (2010). Welch, M.J., Cho, J., Chang, W.: Generating advertising keywords from video content. In: CIKM. pp. 1421\u20131424. ACM (2010)."},{"key":"e_1_3_2_1_26_1","first-page":"1554","volume-title":"Takemura","author":"Yang Z.","unstructured":"Yang , Z. , Garcia , N. , Chu , C. , Otani , M. , Nakashima , Y. , Takemura , H. : BERT representations for video question answering. In : WACV. pp. 1545\u2013 1554 . IEEE (2020). Yang, Z., Garcia, N., Chu, C., Otani, M., Nakashima, Y., Takemura, H.: BERT representations for video question answering. In: WACV. pp. 1545\u20131554. IEEE (2020)."},{"key":"e_1_3_2_1_27_1","first-page":"29","volume-title":"Smola","author":"Yang Z.","unstructured":"Yang , Z. , He , X. , Gao , J. , Deng , L. , Smola , A.J. : Stacked attention networks for image question answering. In : CVPR. pp. 21\u2013 29 . IEEE Computer Society (2016). Yang, Z., He, X., Gao, J., Deng, L., Smola, A.J.: Stacked attention networks for image question answering. In: CVPR. pp. 21\u201329. IEEE Computer Society (2016)."},{"key":"e_1_3_2_1_28_1","first-page":"4340","volume-title":"Sun","author":"Zeng K.","unstructured":"Zeng , K. , Chen , T. , Chuang , C. , Liao , Y. , Niebles , J.C. , Sun , M. : Leveraging video descriptions to learn video question answering. In : AAAI. pp. 4334\u2013 4340 . AAAI Press (2017). Zeng, K., Chen, T., Chuang, C., Liao, Y., Niebles, J.C., Sun, M.: Leveraging video descriptions to learn video question answering. In: AAAI. pp. 4334\u20134340. AAAI Press (2017)."},{"key":"e_1_3_2_1_29_1","first-page":"518","volume-title":"Radke","author":"Zhang L.","unstructured":"Zhang , L. , Radke , R.J. : Temporal attention and consistency measuring for video question answering. In : ICMI. pp. 510\u2013 518 . ACM (2020). Zhang, L., Radke, R.J.: Temporal attention and consistency measuring for video question answering. In: ICMI. pp. 510\u2013518. ACM (2020)."}],"event":{"name":"CSAI 2021: 2021 5th International Conference on Computer Science and Artificial Intelligence","acronym":"CSAI 2021","location":"Beijing China"},"container-title":["2021 5th International Conference on Computer Science and Artificial Intelligence"],"original-title":[],"link":[{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3507548.3507567","content-type":"unspecified","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/dl.acm.org\/doi\/pdf\/10.1145\/3507548.3507567","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2025,6,17]],"date-time":"2025-06-17T20:12:26Z","timestamp":1750191146000},"score":1,"resource":{"primary":{"URL":"https:\/\/dl.acm.org\/doi\/10.1145\/3507548.3507567"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2021,12,4]]},"references-count":29,"alternative-id":["10.1145\/3507548.3507567","10.1145\/3507548"],"URL":"https:\/\/doi.org\/10.1145\/3507548.3507567","relation":{},"subject":[],"published":{"date-parts":[[2021,12,4]]},"assertion":[{"value":"2022-03-09","order":2,"name":"published","label":"Published","group":{"name":"publication_history","label":"Publication History"}}]}}