{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,8]],"date-time":"2026-05-08T19:56:46Z","timestamp":1778270206021,"version":"3.51.4"},"reference-count":52,"publisher":"Springer Science and Business Media LLC","issue":"6","license":[{"start":{"date-parts":[[2023,7,22]],"date-time":"2023-07-22T00:00:00Z","timestamp":1689984000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,7,22]],"date-time":"2023-07-22T00:00:00Z","timestamp":1689984000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Multimed Tools Appl"],"DOI":"10.1007\/s11042-023-16151-w","type":"journal-article","created":{"date-parts":[[2023,7,22]],"date-time":"2023-07-22T09:01:38Z","timestamp":1690016498000},"page":"17281-17298","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Hierarchical Attention Networks for Fact-based Visual Question Answering"],"prefix":"10.1007","volume":"83","author":[{"given":"Haibo","family":"Yao","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Yongkang","family":"Luo","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"ORCID":"https:\/\/orcid.org\/0000-0001-8002-6486","authenticated-orcid":false,"given":"Zhi","family":"Zhang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Jianhang","family":"Yang","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Chengtao","family":"Cai","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,7,22]]},"reference":[{"key":"16151_CR1","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, Mitchell M, Batra D, Zitnick CL, Parikh D (2015) Vqa: Visual question answering. In: Proceedings of the IEEE International Conference on Computer Vision, pp 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"key":"16151_CR2","doi-asserted-by":"crossref","unstructured":"Wang P, Wu Q, Shen C, van\u00a0den Hengel A, Dick A (2017) Explicit knowledge based reasoning for visual question answering. In: Proceedings of the 26th International Joint Conference on Artificial Intelligence, pp 1290\u20131296","DOI":"10.24963\/ijcai.2017\/179"},{"issue":"10","key":"16151_CR3","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TPAMI.2017.2754246","volume":"40","author":"P Wang","year":"2018","unstructured":"Wang P, Wu Q, Shen C, Dick A, van den Hengel A (2018) Fvqa: Fact-based visual question answering. IEEE Transactions on Pattern Analysis and Machine Intelligence 40(10):2413\u20132427","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"16151_CR4","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107563","volume":"108","author":"J Yu","year":"2020","unstructured":"Yu J, Zhu Z, Wang Y, Zhang W, Hu Y, Tan J (2020) Cross-modal knowledge reasoning for knowledge-based visual question answering. Pattern Recognition 108:107563","journal-title":"Pattern Recognition"},{"issue":"1","key":"16151_CR5","doi-asserted-by":"publisher","first-page":"165","DOI":"10.1080\/21645515.2017.1379639","volume":"14","author":"UA Bhatti","year":"2018","unstructured":"Bhatti UA, Huang M, Wang H, Zhang Y, Mehmood A, Di W (2018) Recommendation system for immunization coverage and monitoring. Human Vaccines and Immunotherapeutics 14(1):165\u2013171","journal-title":"Human Vaccines and Immunotherapeutics"},{"key":"16151_CR6","doi-asserted-by":"crossref","unstructured":"Bhatti UA, Zeeshan Z, Nizamani MM, Bazai S, Yu Z, Yuan L (2022) Assessing the change of ambient air quality patterns in jiangsu province of china pre-to post-covid-19. Chemosphere 288","DOI":"10.1016\/j.chemosphere.2021.132569"},{"issue":"3","key":"16151_CR7","doi-asserted-by":"publisher","first-page":"329","DOI":"10.1080\/17517575.2018.1557256","volume":"13","author":"UA Bhatti","year":"2019","unstructured":"Bhatti UA, Huang M, Wu D, Zhang Y, Mehmood A, Han H (2019) Recommendation system using feature extraction and pattern recognition in clinical care systems. Enterprise Information Systems 13(3):329\u2013351","journal-title":"Enterprise Information Systems"},{"key":"16151_CR8","doi-asserted-by":"publisher","first-page":"76386","DOI":"10.1109\/ACCESS.2020.2988298","volume":"8","author":"UA Bhatti","year":"2020","unstructured":"Bhatti UA, Yu Z, Li J, Nawaz SA, Mehmood A, Zhang K, Yuan L (2020) Hybrid watermarking algorithm using clifford algebra with arnold scrambling and chaotic encryption. IEEE Access 8:76386\u201376398","journal-title":"IEEE Access"},{"key":"16151_CR9","doi-asserted-by":"crossref","unstructured":"Bhatti UA, Yu Z, Chanussot J, Zeeshan Z, Yuan L, Luo W, Nawaz SA, Bhatti MA, ul\u00a0Ain Q, Mehmood A (2022) Local similarity-based spatial-spectral fusion hyperspectral image classification with deep cnn and gabor filtering. IEEE Transactions on Geoscience and Remote Sensing 60","DOI":"10.1109\/TGRS.2021.3090410"},{"key":"16151_CR10","doi-asserted-by":"crossref","unstructured":"Wu Q, Wang P, Shen C, Dick A, van\u00a0den Hengel A (2016) Ask me anything:free-form visual question answering based on knowledge from external sources. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 4622\u20134630","DOI":"10.1109\/CVPR.2016.500"},{"key":"16151_CR11","unstructured":"Narasimhan M, Lazebnik S, Schwing AG (2018) Out of the box: Reasoning with graph convolution nets for factual visual question answering. In: Advances in Neural Information Processing Systems, pp 2654\u20132665"},{"key":"16151_CR12","doi-asserted-by":"crossref","unstructured":"Zhu Z, Yu J, Wang Y, Sun Y, Hu Y, Wu Q (2020) Mucko: Multi-layer cross-modal knowledge reasoning for fact-based visual question answering. In: International Joint Conference on Artificial Intelligence, pp 1097\u20131103","DOI":"10.24963\/ijcai.2020\/153"},{"key":"16151_CR13","unstructured":"Vaswani A, Shazeer N, Parmar N, Uszkoreit J, Jones L, Gomez AN, Kaiser L, Polosukhin I (2017) Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp 5999\u20136009"},{"key":"16151_CR14","unstructured":"Zhu Y, Zhang C, R\u00e9 C, Li FF (2015) Building a large-scale multimodal knowledge base system for answering visual queries. arXiv preprint http:\/\/arxiv.org\/abs\/1507.05670arXiv:1507.05670"},{"key":"16151_CR15","doi-asserted-by":"publisher","first-page":"193","DOI":"10.1162\/tacl_a_00220","volume":"1","author":"J Krishnamurthy","year":"2013","unstructured":"Krishnamurthy J, Kollar T (2013) Jointly learning to parse and perceive: Connecting natural language to the physical world. Transactions of the Association for Computational Linguistics 1:193\u2013206","journal-title":"Transactions of the Association for Computational Linguistics"},{"key":"16151_CR16","doi-asserted-by":"crossref","unstructured":"Narasimhan K, Yala A, Barzilay R (2016) Improving information extraction by acquiring external evidence with reinforcement learning. In: Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing, pp 2355\u20132365","DOI":"10.18653\/v1\/D16-1261"},{"key":"16151_CR17","doi-asserted-by":"crossref","unstructured":"Gard\u00e8res F, Ziaeefard M, Abeloos B, Lecue F (2020) Conceptbert: Concept-aware representation for visual question answering. In: Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, pp 489\u2013498","DOI":"10.18653\/v1\/2020.findings-emnlp.44"},{"key":"16151_CR18","doi-asserted-by":"crossref","unstructured":"Marino K, Chen X, Parikh D, Gupta A, Rohrbach M (2021) Krisp: Integrating implicit and symbolic knowledge for open-domain knowledge-based vqa. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14106\u201314116","DOI":"10.1109\/CVPR46437.2021.01389"},{"key":"16151_CR19","doi-asserted-by":"crossref","unstructured":"Wu J, Lu J, Sabharwal A, Mottaghi R (2022) Multi-modal answer validation for knowledge-based vqa. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 2712\u20132721","DOI":"10.1609\/aaai.v36i3.20174"},{"key":"16151_CR20","doi-asserted-by":"crossref","unstructured":"Medhini N, Schwing AG (2018) Straight to the facts: Learning knowledge base retrieval for factual visual question answering. In: Proceedings of the European Conference on Computer Vision, pp 460\u2013477","DOI":"10.1007\/978-3-030-01237-3_28"},{"key":"16151_CR21","unstructured":"Bahdanau D, Cho K, Bengio Y (2015) Neural machine translation by jointly learning to align and translate. In: International Conference on Learning Representations"},{"key":"16151_CR22","doi-asserted-by":"crossref","unstructured":"Zhang S, Feng Y (2022) Gaussian multi-head attention for simultaneous machine translation. In: Annual Meeting of the Association-for-Computational-Linguistics, pp 3019\u20133030","DOI":"10.18653\/v1\/2022.findings-acl.238"},{"issue":"2","key":"16151_CR23","doi-asserted-by":"publisher","first-page":"873","DOI":"10.1109\/TCYB.2020.2988093","volume":"52","author":"J Li","year":"2022","unstructured":"Li J, Pan Z, Liu Q, Cui Y, Sun Y (2022) Complementarity-aware attention network for salient object detection. IEEE Transactions on Cybernetics 52(2):873\u2013886","journal-title":"IEEE Transactions on Cybernetics"},{"key":"16151_CR24","doi-asserted-by":"crossref","unstructured":"Liu S, Zhang L, Lu H, He Y (2022) Center-boundary dual attention for oriented object detection in remote sensing images. IEEE Transactions on Geoscience and Remote Sensing 60","DOI":"10.1109\/TGRS.2021.3069056"},{"issue":"4","key":"16151_CR25","doi-asserted-by":"publisher","first-page":"1073","DOI":"10.1007\/s11263-023-01752-7","volume":"131","author":"X Wu","year":"2023","unstructured":"Wu X, Li T (2023) Sentimental visual captioning using multimodal transformer. International Journal of Computer Vision 131(4):1073\u20131090","journal-title":"International Journal of Computer Vision"},{"key":"16151_CR26","doi-asserted-by":"crossref","unstructured":"Wang W, Bao H, Dong L, Bjorck J, Peng Z, Liu Q, Aggarwal K, Mohammed OK, Singhal S, Som S, Wei F (2022) Image as a foreign language: Beit pretraining for all vision and vision-language tasks. arXiv preprint http:\/\/arxiv.org\/abs\/2208.10442arXiv:2208.10442","DOI":"10.1109\/CVPR52729.2023.01838"},{"key":"16151_CR27","unstructured":"Bao H, Wang W, Dong L, Liu Q, Mohammed OK, Aggarwal K, Som S, Wei F (2022) Vlmo: Unified vision-language pre-training with mixture-of-modality-experts. arXiv preprint http:\/\/arxiv.org\/abs\/2111.02358arXiv:2111.02358"},{"key":"16151_CR28","unstructured":"Li J, Li D, Xiong C, Hoi S, Chaudhuri K, Jegelka S, Song L, Szepesvari C, Niu G, Sabato S (2022) Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation. In: International Conference on Machine Learning"},{"key":"16151_CR29","doi-asserted-by":"crossref","unstructured":"Yu Z, Yu J, Cui Y, Tao D, Tian Q (2019) Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6274\u20136283","DOI":"10.1109\/CVPR.2019.00644"},{"issue":"1","key":"16151_CR30","doi-asserted-by":"publisher","first-page":"318","DOI":"10.1109\/TPAMI.2020.3004830","volume":"44","author":"L Peng","year":"2022","unstructured":"Peng L, Yang Y, Wang Z, Huang Z, Shen HT (2022) Mra-net: Improving vqa via multi-modal relation attention network. IEEE Transactions on Pattern Analysis and Machine Intelligence 44(1):318\u2013329","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"16151_CR31","unstructured":"Nickel M, Tresp V, Kriegel HP (2011) A three-way model for collective learning on multi-relational data. In: Proceedings of the 28th International Conference on Machine Learning, pp 809\u2013816"},{"key":"16151_CR32","unstructured":"Jenatton R, Le\u00a0Roux N, Bordes A, Obozinski G (2012) A latent factor model for highly multi-relational data. In: Advances in Neural Information Processing Systems, pp 3167\u20133175"},{"key":"16151_CR33","unstructured":"Yang B, Yih Wt, He X, Gao J, Deng L (2015) Embedding entities and relations for learning and inference in knowledge bases. In: International Conference on Learning Representations"},{"key":"16151_CR34","doi-asserted-by":"crossref","unstructured":"Nickel M, Rosasco L, Poggio T (2016) Holographic embeddings of knowledge graphs. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 1955\u20131961","DOI":"10.1609\/aaai.v30i1.10314"},{"key":"16151_CR35","unstructured":"Bordes A, Usunier N, Garcia-Duran A, Weston J, Yakhnenko O (2013) Translating embeddings for modeling multi-relational data. In: Advances in Neural Information Processing Systems"},{"key":"16151_CR36","doi-asserted-by":"crossref","unstructured":"Wang Z, Zhang J, Feng J, Chen Z (2014) Knowledge graph embedding by translating on hyperplanes. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 1112\u20131119","DOI":"10.1609\/aaai.v28i1.8870"},{"key":"16151_CR37","doi-asserted-by":"crossref","unstructured":"Lin Y, Liu ZLMSY, Zhu X (2015) Learning entity and relation embeddings for knowledge graph completion. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 2181\u20132187","DOI":"10.1609\/aaai.v29i1.9491"},{"key":"16151_CR38","doi-asserted-by":"crossref","unstructured":"Goel R, Kazemi SM, Brubaker M, Poupart P (2020) Diachronic embedding for temporal knowledge graph completion. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 3988\u20133995","DOI":"10.1609\/aaai.v34i04.5815"},{"key":"16151_CR39","doi-asserted-by":"crossref","unstructured":"Gupta S, Kenkre S, Talukdar P (2019) Care: Open knowledge graph embeddings. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing, pp 378\u2013388","DOI":"10.18653\/v1\/D19-1036"},{"key":"16151_CR40","doi-asserted-by":"crossref","unstructured":"Malaviya C, Bhagavatula C, Bosselut A, Choi Y (2020) Commonsense knowledge base completion with structural and semantic context. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 2925\u20132933","DOI":"10.1609\/aaai.v34i03.5684"},{"key":"16151_CR41","unstructured":"Rammnath K, Hasegawa-Johnson M (2020) Seeing is knowing! fact-based visual question answering using knowledge graph embeddings. arXiv preprint http:\/\/arxiv.org\/abs\/2012.15484arXiv:2012.15484"},{"key":"16151_CR42","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, Teney D, Johnson M, Gould S, Zhang L (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"issue":"6","key":"16151_CR43","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2017","unstructured":"Ren S, He K, Girshick R, Sun J (2017) Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence 39(6):1137\u20131149","journal-title":"IEEE Transactions on Pattern Analysis and Machine Intelligence"},{"key":"16151_CR44","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: Global vectors for word representation. In: Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing, pp 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"issue":"8","key":"16151_CR45","doi-asserted-by":"publisher","first-page":"1735","DOI":"10.1162\/neco.1997.9.8.1735","volume":"9","author":"S Hochreiter","year":"1997","unstructured":"Hochreiter S, Schmidhuber J (1997) Long short-term memory. Neural Computation 9(8):1735\u20131780","journal-title":"Neural Computation"},{"key":"16151_CR46","doi-asserted-by":"crossref","unstructured":"Tandon N, de\u00a0Melo\u00a0andFabian Suchanek G, Weikum G (2014) Webchild: Harvesting and organizing commonsense knowledge from the web. In: Proceedings of the 7th ACM International Conference on Web Search and Data Mining, pp 523\u2013532","DOI":"10.1145\/2556195.2556245"},{"key":"16151_CR47","doi-asserted-by":"crossref","unstructured":"Speer R, Chin J, Havasi C (2017) Conceptnet 5.5: An open multilingual graph of general knowledge. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 4444\u20134451","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"16151_CR48","doi-asserted-by":"crossref","unstructured":"Auer S, Bizer C, Kobilarov G, Lehmann J, Cyganiak R, Ives Z (2007) Dbpedia: A nucleus for a web of open data. In: The Semantic Web, pp 722\u2013735","DOI":"10.1007\/978-3-540-76298-0_52"},{"key":"16151_CR49","doi-asserted-by":"crossref","unstructured":"Guo Y, Nie L, Wong Y, Liu Y, Cheng Z, Kankanhalli M (2022) A unified end-to-end retriever-reader framework for knowledge-based vqa. In: Proceedings of the 30th ACM International Conference on Multimedia, pp 2061\u20132069","DOI":"10.1145\/3503161.3547870"},{"key":"16151_CR50","doi-asserted-by":"crossref","unstructured":"Salemi A, Pizzorno JA, Zamani H (2023) A symmetric dual encoding dense retrieval framework for knowledge-intensive visual question answering. arXiv preprint http:\/\/arxiv.org\/abs\/2304.13649arXiv:2304.13649","DOI":"10.1145\/3539618.3591629"},{"key":"16151_CR51","doi-asserted-by":"crossref","unstructured":"Li H, Wang P, Shen C, van\u00a0den Hengel A (2019) Visual question answering as reading comprehension. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 6312\u20136321","DOI":"10.1109\/CVPR.2019.00648"},{"key":"16151_CR52","doi-asserted-by":"crossref","unstructured":"Liu L, Wang M, He X, Qing L, Chen H (2022) Fact-based visual question answering via dual-process system. Knowledge-based Systems 237","DOI":"10.1016\/j.knosys.2021.107650"}],"container-title":["Multimedia Tools and Applications"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-16151-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s11042-023-16151-w\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s11042-023-16151-w.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,1,31]],"date-time":"2024-01-31T08:57:16Z","timestamp":1706691436000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s11042-023-16151-w"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,7,22]]},"references-count":52,"journal-issue":{"issue":"6","published-online":{"date-parts":[[2024,2]]}},"alternative-id":["16151"],"URL":"https:\/\/doi.org\/10.1007\/s11042-023-16151-w","relation":{},"ISSN":["1573-7721"],"issn-type":[{"value":"1573-7721","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,7,22]]},"assertion":[{"value":"7 January 2022","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"17 May 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"3 July 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"22 July 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"The authors declare that they have no known competing financial interests or personal relationships that could have appeared to influence the work reported in this paper.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}