{"status":"ok","message-type":"work","message-version":"1.0.0","message":{"indexed":{"date-parts":[[2026,5,7]],"date-time":"2026-05-07T16:25:43Z","timestamp":1778171143696,"version":"3.51.4"},"reference-count":49,"publisher":"Springer Science and Business Media LLC","issue":"3","license":[{"start":{"date-parts":[[2023,12,14]],"date-time":"2023-12-14T00:00:00Z","timestamp":1702512000000},"content-version":"tdm","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"},{"start":{"date-parts":[[2023,12,14]],"date-time":"2023-12-14T00:00:00Z","timestamp":1702512000000},"content-version":"vor","delay-in-days":0,"URL":"https:\/\/www.springernature.com\/gp\/researchers\/text-and-data-mining"}],"funder":[{"DOI":"10.13039\/501100001809","name":"National Natural Science Foundation of China","doi-asserted-by":"publisher","award":["62162010"],"award-info":[{"award-number":["62162010"]}],"id":[{"id":"10.13039\/501100001809","id-type":"DOI","asserted-by":"publisher"}]},{"name":"Guizhou Science and Technology Support Program","award":["Qiankehe Support [2022] General 267"],"award-info":[{"award-number":["Qiankehe Support [2022] General 267"]}]}],"content-domain":{"domain":["link.springer.com"],"crossmark-restriction":false},"short-container-title":["Knowl Inf Syst"],"published-print":{"date-parts":[[2024,3]]},"DOI":"10.1007\/s10115-023-02028-9","type":"journal-article","created":{"date-parts":[[2023,12,14]],"date-time":"2023-12-14T20:02:17Z","timestamp":1702584137000},"page":"2193-2208","update-policy":"https:\/\/doi.org\/10.1007\/springer_crossmark_policy","source":"Crossref","is-referenced-by-count":4,"title":["Knowledge enhancement and scene understanding for knowledge-based visual question answering"],"prefix":"10.1007","volume":"66","author":[{"given":"Zhenqiang","family":"Su","sequence":"first","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]},{"given":"Gang","family":"Gou","sequence":"additional","affiliation":[],"role":[{"role":"author","vocabulary":"crossref"}]}],"member":"297","published-online":{"date-parts":[[2023,12,14]]},"reference":[{"key":"2028_CR1","doi-asserted-by":"crossref","unstructured":"Anderson P, He X, Buehler C, et\u00a0al (2018) Bottom-up and top-down attention for image captioning and visual question answering. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 6077\u20136086","DOI":"10.1109\/CVPR.2018.00636"},{"key":"2028_CR2","doi-asserted-by":"crossref","unstructured":"Antol S, Agrawal A, Lu J, et\u00a0al (2015) Vqa: Visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp 2425\u20132433","DOI":"10.1109\/ICCV.2015.279"},{"key":"2028_CR3","doi-asserted-by":"crossref","unstructured":"Ben-Younes H, Cadene R, Cord M, et\u00a0al (2017) Mutan: Multimodal tucker fusion for visual question answering. In: Proceedings of the IEEE international conference on computer vision, pp 2612\u20132620","DOI":"10.1109\/ICCV.2017.285"},{"key":"2028_CR4","unstructured":"Bordes A, Usunier N, Garcia-Dur\u00e1n A, et\u00a0al (2013) Translating embeddings for modeling multi-relational data. In: Proceedings of the 26th International Conference on Neural Information Processing Systems-Volume 2, pp 2787\u20132795"},{"key":"2028_CR5","unstructured":"Chen K, Wang J, Chen LC, et\u00a0al (2015) Abc-cnn: An attention based convolutional neural network for visual question answering. arXiv preprint arXiv:1511.05960"},{"key":"2028_CR6","doi-asserted-by":"crossref","unstructured":"Ding Y, Yu J, Liu B, et\u00a0al (2022) Mukea: Multimodal knowledge extraction and accumulation for knowledge-based visual question answering. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5089\u20135098","DOI":"10.1109\/CVPR52688.2022.00503"},{"key":"2028_CR7","doi-asserted-by":"crossref","unstructured":"Fukui A, Park DH, Yang D, et\u00a0al (2016) Multimodal compact bilinear pooling for visual question answering and visual grounding. In: Conference on Empirical Methods in Natural Language Processing, ACL, pp 457\u2013468","DOI":"10.18653\/v1\/D16-1044"},{"key":"2028_CR8","doi-asserted-by":"crossref","unstructured":"Gard\u00e8res F, Ziaeefard M, Abeloos B et al (2020) Conceptbert: Concept-aware representation for visual question answering. Findings of the Association for Computational Linguistics: EMNLP 2020:489\u2013498","DOI":"10.18653\/v1\/2020.findings-emnlp.44"},{"key":"2028_CR9","doi-asserted-by":"crossref","unstructured":"He K, Zhang X, Ren S, et\u00a0al (2016) Deep residual learning for image recognition. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 770\u2013778","DOI":"10.1109\/CVPR.2016.90"},{"key":"2028_CR10","doi-asserted-by":"crossref","unstructured":"Hwang JD, Bhagavatula C, Le\u00a0Bras R, et\u00a0al (2021) (comet-) atomic 2020: On symbolic and neural commonsense knowledge graphs. In: Proceedings of the AAAI Conference on Artificial Intelligence, pp 6384\u20136392","DOI":"10.1609\/aaai.v35i7.16792"},{"key":"2028_CR11","doi-asserted-by":"crossref","unstructured":"Joshi V, Peters ME, Hopkins M (2018) Extending a parser to distant domains using a few dozen partially annotated examples. In: Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pp 1190\u20131199","DOI":"10.18653\/v1\/P18-1110"},{"key":"2028_CR12","doi-asserted-by":"crossref","unstructured":"Kannan AV, Fradkin D, Akrotirianakis I, et\u00a0al (2020) Multimodal knowledge graph for deep learning papers and code. In: Proceedings of the 29th ACM International Conference on Information & Knowledge Management, pp 3417\u20133420","DOI":"10.1145\/3340531.3417439"},{"key":"2028_CR13","unstructured":"Kim JH, Jun J, Zhang BT (2018) Bilinear attention networks. In: Proceedings of the 32nd International Conference on Neural Information Processing Systems, pp 1571\u20131581"},{"key":"2028_CR14","doi-asserted-by":"crossref","unstructured":"Li M, Zareian A, Lin Y, et\u00a0al (2020a) Gaia: A fine-grained multimedia knowledge extraction system. In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations, pp 77\u201386","DOI":"10.18653\/v1\/2020.acl-demos.11"},{"key":"2028_CR15","doi-asserted-by":"crossref","unstructured":"Li X, Yin X, Li C, et\u00a0al (2020b) Oscar: Object-semantics aligned pre-training for vision-language tasks. In: European Conference on Computer Vision, Springer, pp 121\u2013137","DOI":"10.1007\/978-3-030-58577-8_8"},{"key":"2028_CR16","doi-asserted-by":"crossref","unstructured":"Lin TY, Maire M, Belongie S, et\u00a0al (2014) Microsoft coco: Common objects in context. In: European conference on computer vision, Springer, pp 740\u2013755","DOI":"10.1007\/978-3-319-10602-1_48"},{"key":"2028_CR17","unstructured":"Loshchilov I, Hutter F (2018) Fixing weight decay regularization in adam. In: International Conference on Learning Representations"},{"key":"2028_CR18","unstructured":"Lu J, Batra D, Parikh D, et\u00a0al (2019) Vilbert: Pretraining task-agnostic visiolinguistic representations for vision-and-language tasks. Advances in neural information processing systems 32"},{"key":"2028_CR19","doi-asserted-by":"crossref","unstructured":"Malinowski M, Rohrbach M, Fritz M (2015) Ask your neurons: A neural-based approach to answering questions about images. In: Proceedings of the IEEE international conference on computer vision, pp 1\u20139","DOI":"10.1109\/ICCV.2015.9"},{"key":"2028_CR20","unstructured":"Manola F, Miller E, McBride B, et\u00a0al (2004) Rdf primer. W3C recommendation 10(1-107):6"},{"key":"2028_CR21","doi-asserted-by":"crossref","unstructured":"Marino K, Rastegari M, Farhadi A, et\u00a0al (2019) Ok-vqa: A visual question answering benchmark requiring external knowledge. In: Proceedings of the IEEE\/cvf conference on computer vision and pattern recognition, pp 3195\u20133204","DOI":"10.1109\/CVPR.2019.00331"},{"key":"2028_CR22","doi-asserted-by":"crossref","unstructured":"Marino K, Chen X, Parikh D, et\u00a0al (2021) Krisp: Integrating implicit and symbolic knowledge for open-domain knowledge-based vqa. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 14111\u201314121","DOI":"10.1109\/CVPR46437.2021.01389"},{"key":"2028_CR23","unstructured":"Mikolov T, Chen K, Corrado G, et\u00a0al (2013) Efficient estimation of word representations in vector space. arXiv preprint arXiv:1301.3781"},{"key":"2028_CR24","unstructured":"Mokady R, Hertz A, Bermano AH (2021) Clipcap: Clip prefix for image captioning. arXiv preprint arXiv:2111.09734"},{"key":"2028_CR25","doi-asserted-by":"crossref","unstructured":"Patashnik O, Wu Z, Shechtman E, et\u00a0al (2021) Styleclip: Text-driven manipulation of stylegan imagery. In: Proceedings of the IEEE\/CVF International Conference on Computer Vision, pp 2085\u20132094","DOI":"10.1109\/ICCV48922.2021.00209"},{"key":"2028_CR26","doi-asserted-by":"crossref","unstructured":"Pennington J, Socher R, Manning CD (2014) Glove: Global vectors for word representation. In: Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP), pp 1532\u20131543","DOI":"10.3115\/v1\/D14-1162"},{"key":"2028_CR27","doi-asserted-by":"crossref","unstructured":"Pezeshkpour P, Chen L, Singh S (2018) Embedding multimodal relational data for knowledge base completion. In: Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pp 3208\u20133218","DOI":"10.18653\/v1\/D18-1359"},{"key":"2028_CR28","unstructured":"Radford A, Kim JW, Hallacy C, et\u00a0al (2021) Learning transferable visual models from natural language supervision. In: International Conference on Machine Learning, PMLR, pp 8748\u20138763"},{"key":"2028_CR29","doi-asserted-by":"crossref","unstructured":"Redmon J, Divvala S, Girshick R, et\u00a0al (2016) You only look once: Unified, real-time object detection. In: Proceedings of the IEEE conference on computer vision and pattern recognition, pp 779\u2013788","DOI":"10.1109\/CVPR.2016.91"},{"key":"2028_CR30","doi-asserted-by":"crossref","unstructured":"Reimers N (2020) sentence embeddings using siamese bert-networks. emnlp-ijcnlp 2019\u20132019 conf empir methods nat lang process 9th int jt conf nat lang process proc conf; 2019: 3982-3992","DOI":"10.18653\/v1\/D19-1410"},{"issue":"2","key":"2028_CR31","first-page":"5","volume":"1","author":"M Ren","year":"2015","unstructured":"Ren M, Kiros R, Zemel R (2015) Image question answering: A visual semantic embedding model and a new dataset. Proc Advances in Neural Inf Process Syst 1(2):5","journal-title":"Proc Advances in Neural Inf Process Syst"},{"issue":"6","key":"2028_CR32","doi-asserted-by":"publisher","first-page":"1137","DOI":"10.1109\/TPAMI.2016.2577031","volume":"39","author":"S Ren","year":"2016","unstructured":"Ren S, He K, Girshick R et al (2016) Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Trans Pattern Anal Mach Intell 39(6):1137\u20131149","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2028_CR33","doi-asserted-by":"crossref","unstructured":"Sap M, Le\u00a0Bras R, Allaway E, et\u00a0al (2019) Atomic: An atlas of machine commonsense for if-then reasoning. In: Proceedings of the AAAI conference on artificial intelligence, pp 3027\u20133035","DOI":"10.1609\/aaai.v33i01.33013027"},{"key":"2028_CR34","doi-asserted-by":"crossref","unstructured":"Sharma H, Jalal AS (2021) Image captioning improved visual question answering. Multimedia tools and applications pp 1\u201322","DOI":"10.1007\/s11042-021-11276-2"},{"key":"2028_CR35","unstructured":"Shen S, Li LH, Tan H, et\u00a0al (2021) How much can clip benefit vision-and-language tasks? arXiv preprint arXiv:2107.06383"},{"key":"2028_CR36","doi-asserted-by":"crossref","unstructured":"Speer R, Chin J, Havasi C (2017) ConceptNet 5.5: an open multilingual graph of general knowledge. In: Proceedings of the thirty-first AAAI conference on artificial intelligence. 2017 presented at: AAAI\u201917: thirty-first AAAI conference on artificial intelligence","DOI":"10.1609\/aaai.v31i1.11164"},{"key":"2028_CR37","unstructured":"Su Z, Gou G (2022) Visual question answering research on joint knowledge and visual information reasoning. Computer Engineering and Applications"},{"key":"2028_CR38","doi-asserted-by":"crossref","unstructured":"Sun R, Cao X, Zhao Y, et\u00a0al (2020) Multi-modal knowledge graphs for recommender systems. In: Proceedings of the 29th ACM international conference on information & knowledge management, pp 1405\u20131414","DOI":"10.1145\/3340531.3411947"},{"key":"2028_CR39","doi-asserted-by":"crossref","unstructured":"Tan H, Bansal M (2019) Lxmert: Learning cross-modality encoder representations from transformers. In: Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP), pp 5100\u20135111","DOI":"10.18653\/v1\/D19-1514"},{"key":"2028_CR40","unstructured":"Vaswani A, Shazeer N, Parmar N, et\u00a0al (2017) Attention is all you need. In: Proceedings of the 31st International Conference on Neural Information Processing Systems, pp 6000\u20136010"},{"issue":"10","key":"2028_CR41","doi-asserted-by":"publisher","first-page":"78","DOI":"10.1145\/2629489","volume":"57","author":"D Vrande\u010di\u0107","year":"2014","unstructured":"Vrande\u010di\u0107 D, Kr\u00f6tzsch M (2014) Wikidata: a free collaborative knowledgebase. Commun ACM 57(10):78\u201385","journal-title":"Commun ACM"},{"key":"2028_CR42","unstructured":"Wang G, Zhu M, Xu C, et\u00a0al (2021) Exploiting image captions and external knowledge as representation enhancement for visual question answering. In: Proceedings of the 20th Chinese National Conference on Computational Linguistics, pp 316\u2013326"},{"key":"2028_CR43","doi-asserted-by":"crossref","unstructured":"Wang P, Wu Q, Shen C, et\u00a0al (2017a) Explicit knowledge-based reasoning for visual question answering. In: Proceedings of the 26th International Joint Conference on Artificial Intelligence, pp 1290\u20131296","DOI":"10.24963\/ijcai.2017\/179"},{"issue":"10","key":"2028_CR44","doi-asserted-by":"publisher","first-page":"2413","DOI":"10.1109\/TPAMI.2017.2754246","volume":"40","author":"P Wang","year":"2017","unstructured":"Wang P, Wu Q, Shen C et al (2017) Fvqa: Fact-based visual question answering. IEEE Trans Pattern Anal Mach Intell 40(10):2413\u20132427","journal-title":"IEEE Trans Pattern Anal Mach Intell"},{"key":"2028_CR45","unstructured":"Wang P, Yang A, Men R, et\u00a0al (2022) Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework. In: International Conference on Machine Learning, PMLR, pp 23318\u201323340"},{"key":"2028_CR46","doi-asserted-by":"publisher","DOI":"10.1016\/j.patcog.2020.107563","volume":"108","author":"J Yu","year":"2020","unstructured":"Yu J, Zhu Z, Wang Y et al (2020) Cross-modal knowledge reasoning for knowledge-based visual question answering. Pattern Recogn 108:107563","journal-title":"Pattern Recogn"},{"key":"2028_CR47","doi-asserted-by":"crossref","unstructured":"Yu Z, Yu J, Cui Y, et\u00a0al (2019) Deep modular co-attention networks for visual question answering. In: Proceedings of the IEEE\/CVF conference on computer vision and pattern recognition, pp 6281\u20136290","DOI":"10.1109\/CVPR.2019.00644"},{"key":"2028_CR48","doi-asserted-by":"crossref","unstructured":"Zhang P, Li X, Hu X, et\u00a0al (2021) Vinvl: Revisiting visual representations in vision-language models. In: Proceedings of the IEEE\/CVF Conference on Computer Vision and Pattern Recognition, pp 5579\u20135588","DOI":"10.1109\/CVPR46437.2021.00553"},{"key":"2028_CR49","doi-asserted-by":"crossref","unstructured":"Zhu Z, Yu J, Wang Y, et\u00a0al (2021) Mucko: multi-layer cross-modal knowledge reasoning for fact-based visual question answering. In: Proceedings of the Twenty-Ninth International Conference on International Joint Conferences on Artificial Intelligence, pp 1097\u20131103","DOI":"10.24963\/ijcai.2020\/153"}],"container-title":["Knowledge and Information Systems"],"original-title":[],"language":"en","link":[{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10115-023-02028-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/article\/10.1007\/s10115-023-02028-9\/fulltext.html","content-type":"text\/html","content-version":"vor","intended-application":"text-mining"},{"URL":"https:\/\/link.springer.com\/content\/pdf\/10.1007\/s10115-023-02028-9.pdf","content-type":"application\/pdf","content-version":"vor","intended-application":"similarity-checking"}],"deposited":{"date-parts":[[2024,2,6]],"date-time":"2024-02-06T11:23:37Z","timestamp":1707218617000},"score":1,"resource":{"primary":{"URL":"https:\/\/link.springer.com\/10.1007\/s10115-023-02028-9"}},"subtitle":[],"short-title":[],"issued":{"date-parts":[[2023,12,14]]},"references-count":49,"journal-issue":{"issue":"3","published-print":{"date-parts":[[2024,3]]}},"alternative-id":["2028"],"URL":"https:\/\/doi.org\/10.1007\/s10115-023-02028-9","relation":{},"ISSN":["0219-1377","0219-3116"],"issn-type":[{"value":"0219-1377","type":"print"},{"value":"0219-3116","type":"electronic"}],"subject":[],"published":{"date-parts":[[2023,12,14]]},"assertion":[{"value":"6 February 2023","order":1,"name":"received","label":"Received","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"29 April 2023","order":2,"name":"revised","label":"Revised","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"13 November 2023","order":3,"name":"accepted","label":"Accepted","group":{"name":"ArticleHistory","label":"Article History"}},{"value":"14 December 2023","order":4,"name":"first_online","label":"First Online","group":{"name":"ArticleHistory","label":"Article History"}},{"order":1,"name":"Ethics","group":{"name":"EthicsHeading","label":"Declarations"}},{"value":"We declare that we have no conflict of interest.","order":2,"name":"Ethics","group":{"name":"EthicsHeading","label":"Conflicts of interest"}}]}}