{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2023,5,7]],"date-time":"2023-05-07T04:22:35Z","timestamp":1683433355518},"reference-count":40,"publisher":"Institute of Electronics, Information and Communications Engineers (IEICE)","issue":"5","content-domain":{"domain":[],"crossmark-restriction":false},"short-container-title":["IEICE Trans. Inf. &amp; Syst."],"published-print":{"date-parts":[[2023,5,1]]},"DOI":"10.1587\/transinf.2022dlp0002","type":"journal-article","created":{"date-parts":[[2023,4,30]],"date-time":"2023-04-30T22:23:41Z","timestamp":1682893421000},"page":"581-589","source":"Crossref","is-referenced-by-count":0,"title":["A Visual Question Answering Network Merging High- and Low-Level Semantic Information"],"prefix":"10.1587","volume":"E106.D","author":[{"given":"Huimin","family":"LI","sequence":"first","affiliation":[{"name":"Department of Computer, Shanghai Maritime University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dezhi","family":"HAN","sequence":"additional","affiliation":[{"name":"Department of Computer, Shanghai Maritime University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chongqing","family":"CHEN","sequence":"additional","affiliation":[{"name":"Department of Computer, Shanghai Maritime University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chin-Chen","family":"CHANG","sequence":"additional","affiliation":[{"name":"Department of Information Engineering and Computer Science, Feng Chia University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Kuan-Ching","family":"LI","sequence":"additional","affiliation":[{"name":"Dept. of Computer Science and Information Engineering, Providence University"}],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Dun","family":"LI","sequence":"additional","affiliation":[{"name":"Department of Computer, Shanghai Maritime University"}],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"532","reference":[{"key":"1","doi-asserted-by":"crossref","unstructured":"[1] H. Nam, J.W. Ha, and J. Kim, \u201cDual attention networks for multimodal reasoning and matching,\u201d CoRR, abs\/1611.00471, 2016.","DOI":"10.1109\/CVPR.2017.232"},{"key":"2","unstructured":"[2] J.H. Kim, J. Jun, and B.T. Zhang, \u201cBilinear attention networks,\u201d CoRR, abs\/1805.07932, 2018."},{"key":"3","doi-asserted-by":"crossref","unstructured":"[3] P. Anderson, X. He, C. Buehler, D. Teney, M. Johnson, S. Gould, and L. Zhang, \u201cBottom-up and top-down attention for image captioning and visual question answering,\u201d 2018 IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp.6077-6086, 2018. 10.1109\/cvpr.2018.00636","DOI":"10.1109\/CVPR.2018.00636"},{"key":"4","doi-asserted-by":"crossref","unstructured":"[4] J. Donahue, L.A. Hendricks, S. Guadarrama, M. Rohrbach, S. Venugopalan, K. Saenko, and T. Darrell, \u201cLong-term recurrent convolutional networks for visual recognition and description,\u201d CoRR, abs\/1411.4389, 2014.","DOI":"10.21236\/ADA623249"},{"key":"5","doi-asserted-by":"crossref","unstructured":"[5] A. Agrawal, J. Lu, S. Antol, M. Mitchell, C.L. Zitnick, D. Parikh, and D. Batra, \u201cVqa: Visual question answering,\u201d International Journal of Computer Vision, vol.123, pp.4-31, 2015.","DOI":"10.1007\/s11263-016-0966-6"},{"key":"6","doi-asserted-by":"publisher","unstructured":"[6] Z. Guo, D. Han, and K.-C. Li, \u201cDouble-layer affective visual question answering network,\u201d Comput. Sci. Inf. Syst., vol.18, pp.155-168, 2021. 10.2298\/csis200515038g","DOI":"10.2298\/CSIS200515038G"},{"key":"7","doi-asserted-by":"crossref","unstructured":"[7] D. Han, N. Pan, and K.-C. Li, \u201cA traceable and revocable ciphertext-policy attribute-based encryption scheme based on privacy protection,\u201d IEEE Transactions on Dependable and Secure Computing, pp.316-327, 2020. 10.1109\/tdsc.2020.2977646","DOI":"10.1109\/TDSC.2020.2977646"},{"key":"8","doi-asserted-by":"publisher","unstructured":"[8] A. Das, S. Kottur, K. Gupta, A. Singh, D. Yadav, S. Lee, J.M.F. Moura, D. Parikh, and D. Batra, \u201cVisual dialog.,\u201d IEEE Trans. Pattern Anal. Mach. Intell., vol.41, no.5, pp.1242-1256, 2019. 10.1109\/tpami.2018.2828437","DOI":"10.1109\/TPAMI.2018.2828437"},{"key":"9","doi-asserted-by":"crossref","unstructured":"[9] D. Yu, Q. Xu, H. Guo, C. Zhao, Y. Lin, and D. Li, \u201cAn efficient and lightweight convolutional neural network for remote sensing image scene classification,\u201d Sensors (Basel, Switzerland), vol.20, no.7, 2020. 10.3390\/s20071999","DOI":"10.3390\/s20071999"},{"key":"10","doi-asserted-by":"crossref","unstructured":"[10] Z. Yang, X. He, J. Gao, L. Deng, and A. Smola, \u201cStacked attention networks for image question answering,\u201d 2016 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), pp.21-29, 2016. 10.1109\/cvpr.2016.10","DOI":"10.1109\/CVPR.2016.10"},{"key":"11","doi-asserted-by":"crossref","unstructured":"[11] Z. Yu, J. Yu, Y. Cui, D. Tao, and Q. Tian, \u201cDeep modular co-attention networks for visual question answering,\u201d 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.6274-6283, 2019. 10.1109\/cvpr.2019.00644","DOI":"10.1109\/CVPR.2019.00644"},{"key":"12","doi-asserted-by":"crossref","unstructured":"[12] D.-K. Nguyen and T. Okatani, \u201cImproved fusion of visual and language representations by dense symmetric co-attention for visual question answering,\u201d Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp.6087-6096, 2018. 10.1109\/cvpr.2018.00637","DOI":"10.1109\/CVPR.2018.00637"},{"key":"13","unstructured":"[13] J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova, \u201cBert: Pretraining of deep bidirectional transformers for language understanding,\u201d NAACL-HLT, 2019. 10.18653\/v1\/n19-1423"},{"key":"14","doi-asserted-by":"publisher","unstructured":"[14] Z. Yu, J. Yu, C. Xiang, J. Fan, and D. Tao, \u201cBeyond bilinear: Generalized multimodal factorized high-order pooling for visual question answering,\u201d IEEE transactions on neural networks and learning systems, vol.29, no.12, pp.5947-5959, 2018. 10.1109\/tnnls.2018.2817340","DOI":"10.1109\/TNNLS.2018.2817340"},{"key":"15","doi-asserted-by":"publisher","unstructured":"[15] H. Li and D. Han, \u201cEdurss: A blockchain-based educational records secure storage and sharing scheme,\u201d IEEE Access, vol.7, pp.179273-179289, 2019. 10.1109\/access.2019.2956157","DOI":"10.1109\/ACCESS.2019.2956157"},{"key":"16","doi-asserted-by":"publisher","unstructured":"[16] H. Liu, D. Han, and D. Li, \u201cFabric-iot: A blockchain-based access control system in iot,\u201d IEEE Access, vol.8, pp.18207-18218, 2020. 10.1109\/access.2020.2968492","DOI":"10.1109\/ACCESS.2020.2968492"},{"key":"17","doi-asserted-by":"crossref","unstructured":"[17] P. Gao, Z. Jiang, H. You, P. Lu, S.C.H. Hoi, X. Wang, and H. Li, \u201cDynamic fusion with intra- and inter-modality attention flow for visual question answering,\u201d 2019 IEEE\/CVF Conference on Computer Vision and Pattern Recognition (CVPR), pp.6632-6641, 2019. 10.1109\/cvpr.2019.00680","DOI":"10.1109\/CVPR.2019.00680"},{"key":"18","unstructured":"[18] P. Gao, H. Y ou, Z. Zhang, X. Wang, and H. Li, \u201cMulti-modality latent interaction network for visual question answering,\u201d 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp.5824-5834, 2019. 10.1109\/iccv.2019.00592"},{"key":"19","unstructured":"[19] Z. Yu, Y. Cui, J. Yu, D. Tao, and Q. Tian, \u201cMultimodal unified attention networks for vision-and-language interactions,\u201d arXiv preprint arXiv:1908.04107, 2019."},{"key":"20","doi-asserted-by":"crossref","unstructured":"[20] E. Voita, P. Serdyukov, R. Sennrich, and I. Titov, \u201cContext-aware neural machine translation learns anaphora resolution,\u201d arXiv, abs\/1805.10163, 2018.","DOI":"10.18653\/v1\/P18-1117"},{"key":"21","unstructured":"[21] K. Simonyan and A. Zisserman, \u201cVery deep convolutional networks for large-scale image recognition,\u201d arXiv preprint arXiv:1409.1556, 2014."},{"key":"22","unstructured":"[22] B. Zhou, Y. Tian, S. Sukhbaatar, A. Szlam, and R. Fergus, \u201cSimple baseline for visual question answering,\u201d arXiv preprint arXiv:1512.02167, 2015."},{"key":"23","doi-asserted-by":"publisher","unstructured":"[23] D. Han, S. Zhou, K.C. Li, and R.F de Mello, \u201cCross-modality co-attention networks for visual question answering,\u201d Soft Comput., vol.25, pp.5411-5421, 2021. 10.1007\/s00500-020-05539-7","DOI":"10.1007\/s00500-020-05539-7"},{"key":"24","doi-asserted-by":"publisher","unstructured":"[24] M. Cui, D. Han, and J. Wang, \u201cAn efficient and safe road condition monitoring authentication scheme based on fog computing,\u201d IEEE Internet of Things Journal, vol.6, no.5, pp.9076-9084, 2019. 10.1109\/jiot.2019.2927497","DOI":"10.1109\/JIOT.2019.2927497"},{"key":"25","doi-asserted-by":"crossref","unstructured":"[25] A. Fukui, D.H. Park, D. Yang, A. Rohrbach, T. Darrell, and M. Rohrbach, \u201cMultimodal compact bilinear pooling for visual question answering and visual grounding,\u201d arXiv preprint arXiv:1606.01847, 2016.","DOI":"10.18653\/v1\/D16-1044"},{"key":"26","unstructured":"[26] K. Xu, J. Ba, R. Kiros, K. Cho, A.C. Courville, R. Salakhutdinov, R. Zemel, and Y. Bengio, \u201cShow, attend and tell: Neural image caption generation with visual attention,\u201d ICML, 2015."},{"key":"27","unstructured":"[27] J.B. Delbrouck and S. Dupont, \u201cMultimodal compact bilinear pooling for multimodal neural machine translation,\u201d CoRR, abs\/1703.08084, 2017."},{"key":"28","unstructured":"[28] J.H. Kim, K.W. On, W. Lim, J. Kim, J.W. Ha, and B.T. Zhang, \u201cHadamard product for low-rank bilinear pooling,\u201d arXiv preprint arXiv:1610.04325, 2016."},{"key":"29","doi-asserted-by":"crossref","unstructured":"[29] Z. Yu, J. Yu, J. Fan, and D. Tao, \u201cMulti-modal factorized bilinear pooling with co-attention learning for visual question answering,\u201d Proceedings of the IEEE international conference on computer vision, pp.1821-1830, 2017. 10.1109\/iccv.2017.202","DOI":"10.1109\/ICCV.2017.202"},{"key":"30","doi-asserted-by":"crossref","unstructured":"[30] K. He, X. Zhang, S. Ren, and J. Sun, \u201cDeep residual learning for image recognition,\u201d Proceedings of the IEEE conference on computer vision and pattern recognition, pp.770-778, 2016. 10.1109\/cvpr.2016.90","DOI":"10.1109\/CVPR.2016.90"},{"key":"31","unstructured":"[31] J. Ba, J. Kiros, and G.E. Hinton, \u201cLayer normalization,\u201d arXiv, abs\/1607.06450, 2016."},{"key":"32","doi-asserted-by":"crossref","unstructured":"[32] F. Liu, J. Liu, Z. Fang, R. Hong, and H. Lu, \u201cDensely connected attention flow for visual question answering,\u201d IJCAI, 2019. 10.24963\/ijcai.2019\/122","DOI":"10.24963\/ijcai.2019\/122"},{"key":"33","doi-asserted-by":"crossref","unstructured":"[33] G. Peng, H. You, Z. Zhang, X. Wang, and H. Li, \u201cMulti-modality latent interaction network for visual question answering,\u201d 2019 IEEE\/CVF International Conference on Computer Vision (ICCV), pp.5824-5834, 2019. 10.1109\/iccv.2019.00592","DOI":"10.1109\/ICCV.2019.00592"},{"key":"34","doi-asserted-by":"publisher","unstructured":"[34] T. Wang, H. Luo, X. Zeng, Z. Yu, A. Liu, and A.K. Sangaiah, \u201cMobility based trust evaluation for heterogeneous electric vehicles network in smart cities,\u201d IEEE Transactions on Intelligent Transportation Systems, vol.22, no.3, pp.1797-1806, 2020. 10.1109\/tits.2020.2997377","DOI":"10.1109\/TITS.2020.2997377"},{"key":"35","unstructured":"[35] A. Vaswani, N.M. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A.N. Gomez, L. Kaiser, and I. Polosukhin, \u201cAttention is all you need,\u201d arXiv, abs\/1706.03762, 2017."},{"key":"36","doi-asserted-by":"publisher","unstructured":"[36] R. Krishna, Y. Zhu, O. Groth, J. Johnson, K. Hata, J. Kravitz, S. Chen, Y. Kalantidis, L.-J. Li, D. Shamma, M.S. Bernstein, and L. Fei-Fei, \u201cVisual genome: Connecting language and vision using crowdsourced dense image annotations,\u201d International Journal of Computer Vision, vol.123, pp.32-73, 2016. 10.1007\/s11263-016-0981-7","DOI":"10.1007\/s11263-016-0981-7"},{"key":"37","doi-asserted-by":"crossref","unstructured":"[37] S. Hochreiter and J. Schmidhuber, \u201cLong short-term memory,\u201d Neural Computation, vol.9, no.8, pp.1735-1780, 1997. 10.1162\/neco.1997.9.8.1735","DOI":"10.1162\/neco.1997.9.8.1735"},{"key":"38","doi-asserted-by":"crossref","unstructured":"[38] T.-Y. Lin, M. Maire, S. Belongie, J. Hays, P. Perona, D. Ramanan, P. Doll\u00e1r, and C.L. Zitnick, \u201cMicrosoft coco: Common objects in context,\u201d ECCV, vol.8693, pp.740-755, 2014. 10.1007\/978-3-319-10602-1_48","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"39","doi-asserted-by":"publisher","unstructured":"[39] C. Chen, D. Han, and J. Wang, \u201cMultimodal encoder-decoder attention networks for visual question answering,\u201d IEEE Access, vol.8, pp.35662-35671, 2020. 10.1109\/access.2020.2975093","DOI":"10.1109\/ACCESS.2020.2975093"},{"key":"40","doi-asserted-by":"crossref","unstructured":"[40] S. He and D. Han, \u201cAn effective dense co-attention networks for visual question answering,\u201d Sensors (Basel, Switzerland), vol.20, no.17, 2020. 10.3390\/s20174897","DOI":"10.3390\/s20174897"}],"container-title":["IEICE Transactions on Information and Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E106.D\/5\/E106.D_2022DLP0002\/_pdf","content-type":"unspecified","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2023,5,6]],"date-time":"2023-05-06T04:16:31Z","timestamp":1683346591000},"score":1,"resource":{"primary":{"URL":"https:\/\/www.jstage.jst.go.jp\/article\/transinf\/E106.D\/5\/E106.D_2022DLP0002\/_article"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,5,1]]},"references-count":40,"journal-issue":{"issue":"5","published-print":{"date-parts":[[2023]]}},"URL":"https:\/\/doi.org\/10.1587\/transinf.2022dlp0002","relation":{},"ISSN":["0916-8532","1745-1361"],"issn-type":[{"value":"0916-8532","type":"print"},{"value":"1745-1361","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,5,1]]},"article-number":"2022DLP0002"}}